Vision-CAIR committed on
Commit
85efb5b
1 Parent(s): 679922c

Upload 39 files

Files changed (40)
  1. .gitattributes +2 -0
  2. app.py +317 -0
  3. examples/video1.mp4 +3 -0
  4. examples/video2.mp4 +3 -0
  5. inference.py +94 -0
  6. longvu/.DS_Store +0 -0
  7. longvu/__init__.py +3 -0
  8. longvu/apply_delta.py +59 -0
  9. longvu/builder.py +249 -0
  10. longvu/cambrian_arch.py +1705 -0
  11. longvu/consolidate.py +33 -0
  12. longvu/constants.py +13 -0
  13. longvu/conversation.py +606 -0
  14. longvu/file_io.py +11 -0
  15. longvu/language_model/__pycache__/cambrian_llama.cpython-310.pyc +0 -0
  16. longvu/language_model/__pycache__/cambrian_qwen.cpython-310.pyc +0 -0
  17. longvu/language_model/cambrian_llama.py +546 -0
  18. longvu/language_model/cambrian_qwen.py +471 -0
  19. longvu/make_delta.py +66 -0
  20. longvu/mm_datautils.py +1688 -0
  21. longvu/mm_utils.py +327 -0
  22. longvu/multimodal_encoder/__pycache__/base_encoder.cpython-310.pyc +0 -0
  23. longvu/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
  24. longvu/multimodal_encoder/__pycache__/dino_encoder.cpython-310.pyc +0 -0
  25. longvu/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc +0 -0
  26. longvu/multimodal_encoder/base_encoder.py +135 -0
  27. longvu/multimodal_encoder/builder.py +37 -0
  28. longvu/multimodal_encoder/dino_encoder.py +131 -0
  29. longvu/multimodal_encoder/drop.py +41 -0
  30. longvu/multimodal_encoder/image.py +80 -0
  31. longvu/multimodal_encoder/logging.py +131 -0
  32. longvu/multimodal_encoder/loss.py +96 -0
  33. longvu/multimodal_encoder/registry.py +56 -0
  34. longvu/multimodal_encoder/siglip_encoder.py +78 -0
  35. longvu/multimodal_encoder/utils.py +66 -0
  36. longvu/multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
  37. longvu/multimodal_projector/builder.py +52 -0
  38. longvu/utils.py +25 -0
  39. longvu/vision_sampler.py +566 -0
  40. requirements.txt +28 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/video1.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ examples/video2.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,317 @@
1
+ import spaces
2
+
3
+ import os
4
+ import re
5
+ import traceback
6
+
7
+ import torch
8
+ import gradio as gr
9
+
10
+ import sys
11
+
12
+ import numpy as np
13
+
14
+ from longvu.builder import load_pretrained_model
15
+ from longvu.constants import (
16
+ DEFAULT_IMAGE_TOKEN,
17
+ IMAGE_TOKEN_INDEX,
18
+ )
19
+ from longvu.conversation import conv_templates, SeparatorStyle
20
+ from longvu.mm_datautils import (
21
+ KeywordsStoppingCriteria,
22
+ process_images,
23
+ tokenizer_image_token,
24
+ )
25
+ from decord import cpu, VideoReader
26
+
27
+
28
+ title_markdown = ("""
29
+ LongVU
30
+ """)
31
+
32
+ block_css = """
33
+ #buttons button {
34
+ min-width: min(120px,100%);
35
+ color: #9C276A
36
+ }
37
+ """
38
+
39
+ plum_color = gr.themes.colors.Color(
40
+ name='plum',
41
+ c50='#F8E4EF',
42
+ c100='#E9D0DE',
43
+ c200='#DABCCD',
44
+ c300='#CBA8BC',
45
+ c400='#BC94AB',
46
+ c500='#AD809A',
47
+ c600='#9E6C89',
48
+ c700='#8F5878',
49
+ c800='#804467',
50
+ c900='#713056',
51
+ c950='#662647',
52
+ )
53
+
54
+
55
+ class Chat:
56
+
57
+ def __init__(self):
58
+ self.version = "qwen"
59
+ model_name = "cambrian_qwen"
60
+ model_path = "./checkpoints/longvu_qwen"
61
+ device = "cuda:7"
62
+
63
+ self.tokenizer, self.model, self.processor, _ = load_pretrained_model(model_path, None, model_name, device=device)
64
+ self.model.eval()
65
+
66
+ def remove_after_last_dot(self, s):
67
+ last_dot_index = s.rfind('.')
68
+ if last_dot_index == -1:
69
+ return s
70
+ return s[:last_dot_index + 1]
71
+
72
+ @spaces.GPU(duration=120)
73
+ @torch.inference_mode()
74
+ def generate(self, data: list, message, temperature, top_p, max_output_tokens):
75
+ # TODO: support multiple turns of conversation.
76
+ assert len(data) == 1
77
+
78
+ tensor, image_sizes, modal = data[0]
79
+
80
+ conv = conv_templates[self.version].copy()
81
+
82
+ if isinstance(message, str):
83
+ conv.append_message("user", DEFAULT_IMAGE_TOKEN + '\n' + message)
84
+ elif isinstance(message, list):
85
+ if DEFAULT_IMAGE_TOKEN not in message[0]['content']:
86
+ message[0]['content'] = DEFAULT_IMAGE_TOKEN + '\n' + message[0]['content']
87
+ for mes in message:
88
+ conv.append_message(mes["role"], mes["content"])
89
+
90
+ conv.append_message("assistant", None)
91
+
92
+ prompt = conv.get_prompt()
93
+
94
+ input_ids = (
95
+ tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
96
+ .unsqueeze(0)
97
+ .to(self.model.device)
98
+ )
99
+
100
+ if "llama3" in self.version:
101
+ input_ids = input_ids[0][1:].unsqueeze(0) # remove bos
102
+
103
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
104
+ keywords = [stop_str]
105
+ stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
106
+ with torch.inference_mode():
107
+ output_ids = self.model.generate(
108
+ input_ids,
109
+ images=tensor,
110
+ image_sizes=image_sizes,
111
+ do_sample=True,
112
+ temperature=temperature,
113
+ max_new_tokens=max_output_tokens,
114
+ use_cache=True,
115
+ top_p=top_p,
116
+ stopping_criteria=[stopping_criteria],
117
+ )
118
+
119
+ pred = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
120
+
121
+ return self.remove_after_last_dot(pred)
122
+
123
+
124
+ @spaces.GPU(duration=120)
125
+ def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
126
+ data = []
127
+
128
+ processor = handler.processor
129
+ try:
130
+ if image is not None:
131
+ data.append((processor['image'](image).to(handler.model.device, dtype=dtype), None, '<image>'))
132
+ elif video is not None:
133
+ vr = VideoReader(video, ctx=cpu(0), num_threads=1)
134
+ fps = float(vr.get_avg_fps())
135
+ frame_indices = np.array(
136
+ [
137
+ i
138
+ for i in range(
139
+ 0,
140
+ len(vr),
141
+ round(fps),
142
+ )
143
+ ]
144
+ )
145
+ video_tensor = []
146
+ for frame_index in frame_indices:
147
+ img = vr[frame_index].asnumpy()
148
+ video_tensor.append(img)
149
+ video_tensor = np.stack(video_tensor)
150
+ image_sizes = [video_tensor[0].shape[:2]]
151
+ video_tensor = process_images(video_tensor, processor, handler.model.config)
152
+ video_tensor = [item.unsqueeze(0).to(handler.model.device, dtype=dtype) for item in video_tensor]
153
+ data.append((video_tensor, image_sizes, '<video>'))
154
+ elif image is None and video is None:
155
+ data.append((None, None, '<text>'))
156
+ else:
157
+ raise NotImplementedError("Image and video inputs are not supported at the same time")
158
+ except Exception as e:
159
+ traceback.print_exc()
160
+ return gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), message, chatbot
161
+
162
+ assert len(message) % 2 == 0, "The message history should consist of user/assistant message pairs."
163
+
164
+ show_images = ""
165
+ if image is not None:
166
+ show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
167
+ if video is not None:
168
+ show_images += f'<video controls playsinline width="300" style="display: inline-block;" src="./file={video}"></video>'
169
+
170
+ one_turn_chat = [textbox_in, None]
171
+
172
+ # 1. first run case
173
+ if len(chatbot) == 0:
174
+ one_turn_chat[0] += "\n" + show_images
175
+ # 2. not first run case
176
+ else:
177
+ # scanning the last image or video
178
+ length = len(chatbot)
179
+ for i in range(length - 1, -1, -1):
180
+ previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[i][0])
181
+ previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[i][0])
182
+
183
+ if len(previous_image) > 0:
184
+ previous_image = previous_image[-1]
185
+ # 2.1 new image append or pure text input will start a new conversation
186
+ if (video is not None) or (image is not None and os.path.basename(previous_image) != os.path.basename(image)):
187
+ message.clear()
188
+ one_turn_chat[0] += "\n" + show_images
189
+ break
190
+ elif len(previous_video) > 0:
191
+ previous_video = previous_video[-1]
192
+ # 2.2 new video append or pure text input will start a new conversation
193
+ if image is not None or (video is not None and os.path.basename(previous_video) != os.path.basename(video)):
194
+ message.clear()
195
+ one_turn_chat[0] += "\n" + show_images
196
+ break
197
+
198
+ message.append({'role': 'user', 'content': textbox_in})
199
+ text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
200
+ message.append({'role': 'assistant', 'content': text_en_out})
201
+
202
+ one_turn_chat[1] = text_en_out
203
+ chatbot.append(one_turn_chat)
204
+
205
+ return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), message, chatbot
206
+
207
+
208
+ def regenerate(message, chatbot):
209
+ message.pop(-1), message.pop(-1)
210
+ chatbot.pop(-1)
211
+ return message, chatbot
212
+
213
+
214
+ def clear_history(message, chatbot):
215
+ message.clear(), chatbot.clear()
216
+ return (gr.update(value=None, interactive=True),
217
+ gr.update(value=None, interactive=True),
218
+ message, chatbot,
219
+ gr.update(value=None, interactive=True))
220
+
221
+ handler = Chat()
222
+
223
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
224
+
225
+ theme = gr.themes.Default(primary_hue=plum_color)
226
+ # theme.update_color("primary", plum_color.c500)
227
+ theme.set(slider_color="#9C276A")
228
+ theme.set(block_title_text_color="#9C276A")
229
+ theme.set(block_label_text_color="#9C276A")
230
+ theme.set(button_primary_text_color="#9C276A")
231
+
232
+ with gr.Blocks(title='LongVU', theme=theme, css=block_css) as demo:
233
+ gr.Markdown(title_markdown)
234
+ message = gr.State([])
235
+
236
+ with gr.Row():
237
+ with gr.Column(scale=3):
238
+ image = gr.State(None)
239
+ video = gr.Video(label="Input Video")
240
+
241
+ with gr.Accordion("Parameters", open=True) as parameter_row:
242
+
243
+ temperature = gr.Slider(
244
+ minimum=0.1,
245
+ maximum=1.0,
246
+ value=0.2,
247
+ step=0.1,
248
+ interactive=True,
249
+ label="Temperature",
250
+ )
251
+
252
+ top_p = gr.Slider(
253
+ minimum=0.0,
254
+ maximum=1.0,
255
+ value=0.7,
256
+ step=0.1,
257
+ interactive=True,
258
+ label="Top P",
259
+ )
260
+
261
+ max_output_tokens = gr.Slider(
262
+ minimum=64,
263
+ maximum=512,
264
+ value=128,
265
+ step=64,
266
+ interactive=True,
267
+ label="Max output tokens",
268
+ )
269
+
270
+ with gr.Column(scale=7):
271
+ chatbot = gr.Chatbot(label="LongVU", bubble_full_width=True, height=420)
272
+ with gr.Row():
273
+ with gr.Column(scale=8):
274
+ textbox.render()
275
+ with gr.Column(scale=1, min_width=50):
276
+ submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
277
+ with gr.Row(elem_id="buttons") as button_row:
278
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
279
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
280
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
281
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
282
+
283
+ with gr.Row():
284
+ with gr.Column():
285
+ gr.Examples(
286
+ examples=[
287
+ [
288
+ f"./examples/video1.mp4",
289
+ "Describe this video in detail.",
290
+ ],
291
+ [
292
+ f"./examples/video2.mp4",
293
+ "Which country does the boy in the video probably come from?",
294
+ ]
295
+ ],
296
+ inputs=[video, textbox],
297
+ )
298
+
299
+ submit_btn.click(
300
+ generate,
301
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
302
+ [image, video, message, chatbot])
303
+
304
+ regenerate_btn.click(
305
+ regenerate,
306
+ [message, chatbot],
307
+ [message, chatbot]).then(
308
+ generate,
309
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
310
+ [image, video, message, chatbot])
311
+
312
+ clear_btn.click(
313
+ clear_history,
314
+ [message, chatbot],
315
+ [image, video, message, chatbot, textbox])
316
+
317
+ demo.launch()
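
Note: the `generate` wrapper above samples each uploaded clip at roughly one frame per second with decord before handing the frames to `process_images`. A minimal standalone sketch of just that sampling step is shown below; it assumes only a local MP4 path and reproduces the indexing logic used in app.py.

import numpy as np
from decord import cpu, VideoReader

def sample_frames_1fps(video_path: str) -> np.ndarray:
    # Open the clip on CPU and take one frame per (rounded) second of video,
    # mirroring the frame_indices construction in app.py.
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    fps = float(vr.get_avg_fps())
    frame_indices = list(range(0, len(vr), round(fps)))
    return np.stack([vr[i].asnumpy() for i in frame_indices])  # (T, H, W, 3)

# Example: frames = sample_frames_1fps("./examples/video1.mp4")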
examples/video1.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0431514403beba3c269a318b4e5eb6c08cd6940edb10d5014b4745c5fee31ac0
3
+ size 1171735
examples/video2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3069129741c1ed79524a3eeb138ce4f99a9b7fedf36652afd8ad7d83b1d6008b
3
+ size 1606730
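
Note: the two entries above are Git LFS pointer files (version, oid, size), not the videos themselves. A hedged sketch for fetching the actual media through huggingface_hub is below; the repo_id and repo_type are assumptions and may need to be adjusted to wherever this repository is hosted.

from huggingface_hub import hf_hub_download

# repo_id and repo_type below are assumed, not taken from this commit.
video_path = hf_hub_download(
    repo_id="Vision-CAIR/LongVU",
    filename="examples/video1.mp4",
    repo_type="space",
)
print(video_path)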
inference.py ADDED
@@ -0,0 +1,94 @@
1
+ import numpy as np
2
+
3
+ import torch
4
+
5
+ from longvu.builder import load_pretrained_model
6
+ from longvu.constants import (
7
+ DEFAULT_IMAGE_TOKEN,
8
+ IMAGE_TOKEN_INDEX,
9
+ )
10
+ from longvu.conversation import conv_templates, SeparatorStyle
11
+ from longvu.mm_datautils import (
12
+ KeywordsStoppingCriteria,
13
+ process_images,
14
+ tokenizer_image_token,
15
+ )
16
+ from decord import cpu, VideoReader
17
+
18
+ version = "qwen"
19
+ model_name = "cambrian_qwen"
20
+ input_model_local_path = "./checkpoints/longvu_qwen"
21
+
22
+ device = "cuda:7"
23
+
24
+ tokenizer, model, image_processor, context_len = load_pretrained_model(
25
+ input_model_local_path, None, model_name, device=device
26
+ )
27
+ model.get_model().config.tokenizer_model_max_length = 8192
28
+ model.get_model().config.inference_max_length = 128
29
+ model.config.use_cache = True
30
+ print(model.device)
31
+
32
+ model.eval()
33
+
34
+ video_path = "./examples/video1.mp4"
35
+ qs = "Describe this video in detail"
36
+
37
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
38
+ fps = float(vr.get_avg_fps())
39
+ frame_indices = np.array(
40
+ [
41
+ i
42
+ for i in range(
43
+ 0,
44
+ len(vr),
45
+ round(fps),
46
+ )
47
+ ]
48
+ )
49
+ video = []
50
+ for frame_index in frame_indices:
51
+ img = vr[frame_index].asnumpy()
52
+ video.append(img)
53
+ video = np.stack(video)
54
+ image_sizes = [video[0].shape[:2]]
55
+ video = process_images(video, image_processor, model.config)
56
+ video = [item.unsqueeze(0) for item in video]
57
+
58
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
59
+
60
+ conv = conv_templates[version].copy()
61
+ conv.append_message(conv.roles[0], qs)
62
+ conv.append_message(conv.roles[1], None)
63
+ prompt = conv.get_prompt()
64
+
65
+ input_ids = (
66
+ tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
67
+ .unsqueeze(0)
68
+ .to(model.device)
69
+ )
70
+
71
+ if "llama3" in version:
72
+ input_ids = input_ids[0][1:].unsqueeze(0) # remove bos
73
+
74
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
75
+ keywords = [stop_str]
76
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
77
+ with torch.inference_mode():
78
+ output_ids = model.generate(
79
+ input_ids,
80
+ images=video,
81
+ image_sizes=image_sizes,
82
+ do_sample=False,
83
+ temperature=0.2,
84
+ max_new_tokens=128,
85
+ use_cache=True,
86
+ stopping_criteria=[stopping_criteria],
87
+ )
88
+
89
+ if isinstance(output_ids, tuple):
90
+ output_ids = output_ids[0]
91
+
92
+ pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
93
+
94
+ print("pred: ", pred, flush=True)
longvu/.DS_Store ADDED
Binary file (8.2 kB).
longvu/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # pyre-unsafe
2
+ from .language_model.cambrian_qwen import CambrianQwenModel
3
+ from .language_model.cambrian_llama import CambrianLlamaModel
longvu/apply_delta.py ADDED
@@ -0,0 +1,59 @@
1
+ # pyre-unsafe
2
+ """
3
+ Usage:
4
+ python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
5
+ """
6
+
7
+ import argparse
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+
13
+ from . import LlavaLlamaForCausalLM
14
+
15
+
16
+ def apply_delta(base_model_path, target_model_path, delta_path):
17
+ print("Loading base model")
18
+ base = AutoModelForCausalLM.from_pretrained(
19
+ base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
20
+ )
21
+
22
+ print("Loading delta")
23
+ delta = LlavaLlamaForCausalLM.from_pretrained(
24
+ delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
25
+ )
26
+ delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
27
+
28
+ print("Applying delta")
29
+ for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
30
+ if name not in base.state_dict():
31
+ assert name in [
32
+ "model.mm_projector.weight",
33
+ "model.mm_projector.bias",
34
+ ], f"{name} not in base model"
35
+ continue
36
+ if param.data.shape == base.state_dict()[name].shape:
37
+ param.data += base.state_dict()[name]
38
+ else:
39
+ assert name in [
40
+ "model.embed_tokens.weight",
41
+ "lm_head.weight",
42
+ ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}"
43
+ bparam = base.state_dict()[name]
44
+ param.data[: bparam.shape[0], : bparam.shape[1]] += bparam
45
+
46
+ print("Saving target model")
47
+ delta.save_pretrained(target_model_path)
48
+ delta_tokenizer.save_pretrained(target_model_path)
49
+
50
+
51
+ if __name__ == "__main__":
52
+ parser = argparse.ArgumentParser()
53
+ parser.add_argument("--base-model-path", type=str, required=True)
54
+ parser.add_argument("--target-model-path", type=str, required=True)
55
+ parser.add_argument("--delta-path", type=str, required=True)
56
+
57
+ args = parser.parse_args()
58
+
59
+ apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
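
Note: apply_delta reconstructs full weights by adding the base model's parameters onto the published delta and saving the merged result. A hedged sketch of calling it directly is below; the three paths are hypothetical placeholders, and it assumes the module's relative import of LlavaLlamaForCausalLM resolves in the installed package.

from longvu.apply_delta import apply_delta

apply_delta(
    base_model_path="./checkpoints/base-llm",      # hypothetical base checkpoint
    target_model_path="./checkpoints/merged-llm",  # hypothetical output directory
    delta_path="./checkpoints/delta-weights",      # hypothetical delta checkpoint
)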
longvu/builder.py ADDED
@@ -0,0 +1,249 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # pyre-unsafe
16
+
17
+
18
+ import os
19
+ import shutil
20
+ import warnings
21
+
22
+ import torch
23
+ from longvu.constants import (
24
+ DEFAULT_IM_END_TOKEN,
25
+ DEFAULT_IM_START_TOKEN,
26
+ DEFAULT_IMAGE_PATCH_TOKEN,
27
+ )
28
+
29
+ from longvu.language_model.cambrian_llama import CambrianLlamaForCausalLM
30
+ from longvu.language_model.cambrian_qwen import CambrianQwenForCausalLM
31
+
32
+ from transformers import (
33
+ AutoConfig,
34
+ AutoModelForCausalLM,
35
+ AutoTokenizer,
36
+ BitsAndBytesConfig,
37
+ )
38
+
39
+
40
+ def load_pretrained_model(
41
+ model_path,
42
+ model_base,
43
+ model_name,
44
+ load_8bit=False,
45
+ load_4bit=False,
46
+ device_map="auto",
47
+ device="cuda",
48
+ use_flash_attn=False,
49
+ model_args=None,
50
+ **kwargs,
51
+ ):
52
+ kwargs = {"device_map": device_map, **kwargs}
53
+
54
+ if device != "cuda":
55
+ kwargs["device_map"] = {"": device}
56
+
57
+ if load_8bit:
58
+ kwargs["load_in_8bit"] = True
59
+ elif load_4bit:
60
+ kwargs["load_in_4bit"] = True
61
+ kwargs["quantization_config"] = BitsAndBytesConfig(
62
+ load_in_4bit=True,
63
+ bnb_4bit_compute_dtype=torch.float16,
64
+ bnb_4bit_use_double_quant=True,
65
+ bnb_4bit_quant_type="nf4",
66
+ )
67
+ else:
68
+ kwargs["torch_dtype"] = torch.float16
69
+
70
+ if use_flash_attn:
71
+ kwargs["attn_implementation"] = "flash_attention_2"
72
+
73
+ if "cambrian" in model_name.lower():
74
+ # Load Cambrian model
75
+ if "lora" in model_name.lower() and model_base is None:
76
+ warnings.warn(
77
+ "There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged."
78
+ )
79
+ if "lora" in model_name.lower() and model_base is not None:
80
+ # pyre-fixme[21]: Could not find module
81
+ # `core_ai.llava.language_model.cambrian_llama`.
82
+ from core_ai.llava.language_model.cambrian_llama import CambrianConfig
83
+
84
+ lora_cfg_pretrained = CambrianConfig.from_pretrained(model_path)
85
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
86
+ print("Loading Cambrian from base model...")
87
+ model = CambrianLlamaForCausalLM.from_pretrained(
88
+ model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs
89
+ )
90
+ token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
91
+ if model.lm_head.weight.shape[0] != token_num:
92
+ model.lm_head.weight = torch.nn.Parameter(
93
+ torch.empty(
94
+ token_num, tokem_dim, device=model.device, dtype=model.dtype
95
+ )
96
+ )
97
+ model.model.embed_tokens.weight = torch.nn.Parameter(
98
+ torch.empty(
99
+ token_num, tokem_dim, device=model.device, dtype=model.dtype
100
+ )
101
+ )
102
+
103
+ print("Loading additional Cambrian weights...")
104
+ if os.path.exists(os.path.join(model_path, "non_lora_trainables.bin")):
105
+ non_lora_trainables = torch.load(
106
+ os.path.join(model_path, "non_lora_trainables.bin"),
107
+ map_location="cpu",
108
+ )
109
+ else:
110
+ # this is probably from HF Hub
111
+ from huggingface_hub import hf_hub_download
112
+
113
+ def load_from_hf(repo_id, filename, subfolder=None):
114
+ cache_file = hf_hub_download(
115
+ repo_id=repo_id, filename=filename, subfolder=subfolder
116
+ )
117
+ return torch.load(cache_file, map_location="cpu")
118
+
119
+ non_lora_trainables = load_from_hf(
120
+ model_path, "non_lora_trainables.bin"
121
+ )
122
+ non_lora_trainables = {
123
+ (k[11:] if k.startswith("base_model.") else k): v
124
+ for k, v in non_lora_trainables.items()
125
+ }
126
+ if any(k.startswith("model.model.") for k in non_lora_trainables):
127
+ non_lora_trainables = {
128
+ (k[6:] if k.startswith("model.") else k): v
129
+ for k, v in non_lora_trainables.items()
130
+ }
131
+ model.load_state_dict(non_lora_trainables, strict=False)
132
+
133
+ from peft import PeftModel
134
+
135
+ print("Loading LoRA weights...")
136
+ model = PeftModel.from_pretrained(model, model_path)
137
+ print("Merging LoRA weights...")
138
+ model = model.merge_and_unload()
139
+ print("Model is loaded...")
140
+ elif model_base is not None:
141
+ # this may be mm projector only
142
+ print(f"Loading Cambrian-1 from base model... {model_base}")
143
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
144
+ cfg_pretrained = AutoConfig.from_pretrained(model_path)
145
+ model = CambrianLlamaForCausalLM.from_pretrained(
146
+ model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
147
+ )
148
+
149
+ mm_projector_weights = torch.load(
150
+ os.path.join(model_path, "mm_projector.bin"), map_location="cpu"
151
+ )
152
+ mm_projector_weights = {
153
+ k: v.to(torch.float16) for k, v in mm_projector_weights.items()
154
+ }
155
+ model.load_state_dict(mm_projector_weights, strict=False)
156
+ else:
157
+ if "qwen" in model_name.lower():
158
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
159
+ model = CambrianQwenForCausalLM.from_pretrained(
160
+ model_path, low_cpu_mem_usage=True, **kwargs
161
+ )
162
+ else:
163
+ print(f"Loading Cambrian from {model_path}")
164
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
165
+ model = CambrianLlamaForCausalLM.from_pretrained(
166
+ model_path, low_cpu_mem_usage=True, **kwargs
167
+ )
168
+ else:
169
+ # Load language model
170
+ if model_base is not None:
171
+ # PEFT model
172
+ from peft import PeftModel
173
+
174
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
175
+ model = AutoModelForCausalLM.from_pretrained(
176
+ model_base, low_cpu_mem_usage=True, **kwargs
177
+ )
178
+ print(f"Loading LoRA weights from {model_path}")
179
+ model = PeftModel.from_pretrained(model, model_path)
180
+ print(f"Merging weights")
181
+ model = model.merge_and_unload()
182
+ print("Convert to FP16...")
183
+ model.to(torch.float16)
184
+ else:
185
+ use_fast = False
186
+ if "mpt" in model_name.lower():
187
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
188
+ model = AutoModelForCausalLM.from_pretrained(
189
+ model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs
190
+ )
191
+ else:
192
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
193
+ model = AutoModelForCausalLM.from_pretrained(
194
+ model_path, low_cpu_mem_usage=True, **kwargs
195
+ )
196
+
197
+ image_processor = None
198
+
199
+ if "llava" in model_name.lower():
200
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
201
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
202
+ if mm_use_im_patch_token:
203
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
204
+ if mm_use_im_start_end:
205
+ tokenizer.add_tokens(
206
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
207
+ )
208
+ model.resize_token_embeddings(len(tokenizer))
209
+
210
+ vision_tower = model.get_vision_tower()
211
+ if not vision_tower.is_loaded:
212
+ try:
213
+ vision_tower.load_model(device_map=device_map)
214
+ except ValueError:
215
+ # ClipVisionTower doesn't support loading with device_map 'auto'
216
+ vision_tower.load_model()
217
+ vision_tower.to(device="cuda", dtype=torch.float16)
218
+ if device_map != "auto":
219
+ vision_tower.to(device=device_map, dtype=torch.float16)
220
+ image_processor = vision_tower.image_processor
221
+ elif "cambrian" in model_name.lower():
222
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
223
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
224
+ if mm_use_im_patch_token:
225
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
226
+ if mm_use_im_start_end:
227
+ tokenizer.add_tokens(
228
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
229
+ )
230
+ model.resize_token_embeddings(len(tokenizer))
231
+
232
+ vision_tower_aux_list = model.get_vision_tower_aux_list()
233
+
234
+ for vision_tower_aux in vision_tower_aux_list:
235
+ if not vision_tower_aux.is_loaded:
236
+ vision_tower_aux.load_model(device_map=device_map)
237
+ vision_tower_aux.to(device=device, dtype=torch.float16)
238
+
239
+ image_processor = [
240
+ vision_tower_aux.image_processor
241
+ for vision_tower_aux in vision_tower_aux_list
242
+ ]
243
+
244
+ if hasattr(model.config, "max_sequence_length"):
245
+ context_len = model.config.max_sequence_length
246
+ else:
247
+ context_len = 2048
248
+
249
+ return tokenizer, model, image_processor, context_len
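
Note: load_pretrained_model returns (tokenizer, model, image_processor, context_len); with a merged Qwen-based checkpoint it takes the "cambrian" branch with model_base=None. A minimal sketch mirroring the call in inference.py is below, assuming the checkpoint has been downloaded to the path used elsewhere in this upload.

from longvu.builder import load_pretrained_model

tokenizer, model, image_processor, context_len = load_pretrained_model(
    "./checkpoints/longvu_qwen",  # model_path, assumed local checkpoint directory
    None,                         # model_base: not needed for a merged checkpoint
    "cambrian_qwen",              # model_name selects the Cambrian/Qwen loading path
    device="cuda",
)
model.eval()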
longvu/cambrian_arch.py ADDED
@@ -0,0 +1,1705 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import math
17
+ import random
18
+ from abc import ABC, abstractmethod
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ from longvu.constants import (
25
+ DEFAULT_IM_END_TOKEN,
26
+ DEFAULT_IM_START_TOKEN,
27
+ DEFAULT_IMAGE_PATCH_TOKEN,
28
+ IGNORE_INDEX,
29
+ IMAGE_TOKEN_INDEX,
30
+ )
31
+
32
+ from .multimodal_encoder.builder import build_vision_tower_aux_list
33
+ from .multimodal_projector.builder import build_vision_projector
34
+ from .vision_sampler import VisionTokenSampler
35
+
36
+ IS_XLA_AVAILABLE = False
37
+
38
+
39
+ class CambrianMetaModel:
40
+
41
+ def __init__(self, config):
42
+ super(CambrianMetaModel, self).__init__(config)
43
+
44
+ if hasattr(config, "mm_vision_tower_aux_list"):
45
+
46
+ projector_type = getattr(config, "mm_projector_type", "linear")
47
+ if projector_type == "sva":
48
+
49
+ vision_hidden_size = config.vision_hidden_size
50
+ num_query_group = config.num_query_group
51
+ query_num_list = config.query_num_list
52
+ connector_only = config.connector_only
53
+ connector_depth = config.connector_depth
54
+ self.vision_tower_aux_list = build_vision_tower_aux_list(
55
+ config, delay_load=True
56
+ )
57
+ self.mm_projector = nn.Sequential(
58
+ nn.Linear(vision_hidden_size * num_query_group, config.hidden_size),
59
+ nn.GELU(),
60
+ nn.Linear(config.hidden_size, config.hidden_size),
61
+ )
62
+
63
+ image_token_len = config.image_token_len
64
+ vision_tower_aux_token_len_list = (
65
+ self.config.mm_vision_tower_aux_token_len_list
66
+ )
67
+ cross_att_token_len_list = [
68
+ int(vision_tower_aux_token_len**0.5) // int(image_token_len**0.5)
69
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
70
+ ]
71
+
72
+ for aux_i, vision_tower_aux in enumerate(self.vision_tower_aux_list):
73
+ setattr(
74
+ self,
75
+ "mm_projector_aux_{}".format(aux_i),
76
+ nn.Sequential(
77
+ nn.Linear(vision_tower_aux.hidden_size, vision_hidden_size),
78
+ nn.GELU(),
79
+ nn.Linear(vision_hidden_size, vision_hidden_size),
80
+ nn.LayerNorm(vision_hidden_size),
81
+ ),
82
+ )
83
+
84
+ for query_group_i in range(num_query_group):
85
+ cross_att_token_len_list = [
86
+ int(vision_tower_aux_token_len**0.5)
87
+ // int(query_num_list[query_group_i] ** 0.5)
88
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
89
+ ]
90
+ setattr(
91
+ self,
92
+ "vision_sampler_{}".format(query_group_i),
93
+ VisionTokenSampler(
94
+ vision_hidden_size,
95
+ vision_hidden_size,
96
+ [vision_hidden_size] * len(self.vision_tower_aux_list),
97
+ cross_att_token_len_list,
98
+ vision_hidden_size,
99
+ connector_depth,
100
+ ),
101
+ )
102
+
103
+ if not connector_only:
104
+ num_of_vision_sampler_layers = (
105
+ config.num_of_vision_sampler_layers
106
+ ) = config.num_of_vision_sampler_layers
107
+ config.start_of_vision_sampler_layers = (
108
+ config.start_of_vision_sampler_layers
109
+ )
110
+ config.stride_of_vision_sampler_layers = (
111
+ config.stride_of_vision_sampler_layers
112
+ )
113
+ cross_att_token_len_list = [
114
+ int(vision_tower_aux_token_len**0.5)
115
+ // int(image_token_len**0.5)
116
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
117
+ ]
118
+ self.vision_sampler_layers = nn.ModuleList(
119
+ [
120
+ VisionTokenSampler(
121
+ config.hidden_size,
122
+ vision_hidden_size,
123
+ [vision_hidden_size] * len(self.vision_tower_aux_list),
124
+ cross_att_token_len_list,
125
+ vision_hidden_size,
126
+ 1,
127
+ )
128
+ for layer_idx in range(0, num_of_vision_sampler_layers)
129
+ ]
130
+ )
131
+
132
+ self.vision_query = nn.Parameter(
133
+ torch.randn((num_query_group, vision_hidden_size), dtype=self.dtype)
134
+ )
135
+
136
+ self.image_newline = nn.Parameter(
137
+ torch.empty(config.hidden_size, dtype=self.dtype)
138
+ )
139
+
140
+ self.frame_pos = torch.stack(
141
+ [
142
+ 1
143
+ / torch.pow(
144
+ torch.tensor(10000),
145
+ torch.tensor(2 * (hid_j // 2) / config.hidden_size),
146
+ )
147
+ for hid_j in range(config.hidden_size)
148
+ ]
149
+ )
150
+
151
+ else:
152
+ self.vision_tower_aux_list = build_vision_tower_aux_list(
153
+ config, delay_load=True
154
+ )
155
+ config.mm_hidden_size = sum(
156
+ [
157
+ vision_tower_aux.hidden_size
158
+ for vision_tower_aux in self.vision_tower_aux_list
159
+ ]
160
+ )
161
+ self.mm_projector = build_vision_projector(config)
162
+ self.image_newline = nn.Parameter(
163
+ torch.empty(config.hidden_size, dtype=self.dtype)
164
+ )
165
+
166
+ def get_frame_pos(self, time_range):
167
+ frame_pos = self.frame_pos.reshape(1, -1) * time_range.reshape(-1, 1).to(
168
+ self.frame_pos.device
169
+ )
170
+ frame_pos[:, 0::2] = torch.sin(frame_pos[:, 0::2])
171
+ frame_pos[:, 1::2] = torch.cos(frame_pos[:, 0::2])
172
+ frame_pos = frame_pos.unsqueeze(1)
173
+ return frame_pos
174
+
175
+ # def get_vision_tower(self):
176
+ # vision_tower = getattr(self, 'vision_tower', None)
177
+ # if type(vision_tower) is list:
178
+ # vision_tower = vision_tower[0]
179
+ # return vision_tower
180
+
181
+ def get_vision_tower_aux_list(self):
182
+ vision_tower_aux_list = getattr(self, "vision_tower_aux_list", None)
183
+ return vision_tower_aux_list
184
+
185
+ def initialize_vision_modules(self, model_args, fsdp=None):
186
+ # vision_tower = model_args.vision_tower
187
+ num_query_group = model_args.num_query_group
188
+ query_num_list = model_args.query_num_list
189
+ vision_hidden_size = model_args.vision_hidden_size
190
+ vision_tower_aux_list = model_args.vision_tower_aux_list
191
+ vision_tower_aux_token_len_list = model_args.vision_tower_aux_token_len_list
192
+ image_token_len = model_args.image_token_len
193
+ mm_vision_select_layer = model_args.mm_vision_select_layer
194
+ mm_vision_select_feature = model_args.mm_vision_select_feature
195
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
196
+ connector_only = model_args.connector_only
197
+ connector_depth = model_args.connector_depth
198
+
199
+ # self.config.mm_vision_tower = vision_tower
200
+ self.config.image_token_len = image_token_len
201
+ self.config.num_query_group = num_query_group
202
+ self.config.query_num_list = query_num_list
203
+ assert num_query_group == len(query_num_list)
204
+ self.config.connector_depth = connector_depth
205
+ self.config.mm_vision_tower_aux_list = vision_tower_aux_list
206
+ self.config.mm_vision_tower_aux_token_len_list = vision_tower_aux_token_len_list
207
+ self.config.connector_only = connector_only
208
+ self.config.highres_connect = model_args.highres_connect
209
+ self.config.highres = model_args.highres
210
+ self.config.frame_pos = model_args.frame_pos
211
+ self.config.lowres_token = model_args.lowres_token
212
+ self.config.connect_layer = model_args.connect_layer
213
+ self.config.dino_threshold = getattr(model_args, "dino_threshold", 0.83)
214
+ self.config.drop_threshold = getattr(model_args, "drop_threshold", 0.6)
215
+ self.config.is_image_newline = getattr(model_args, "is_image_newline", True)
216
+
217
+ if self.get_vision_tower_aux_list() is None:
218
+ vision_tower_aux_list = build_vision_tower_aux_list(model_args)
219
+ if model_args.unfreeze_mm_vision_tower:
220
+ self.vision_tower_aux_list = nn.ModuleList(vision_tower_aux_list)
221
+ else:
222
+ self.vision_tower_aux_list = vision_tower_aux_list
223
+ else:
224
+ vision_tower_aux_list = self.vision_tower_aux_list
225
+ for vision_tower_aux in vision_tower_aux_list:
226
+ vision_tower_aux.load_model()
227
+
228
+ self.config.use_mm_proj = True
229
+ self.config.mm_projector_type = getattr(
230
+ model_args, "mm_projector_type", "linear"
231
+ )
232
+ self.config.vision_hidden_size = vision_hidden_size
233
+ self.config.mm_vision_select_layer = mm_vision_select_layer
234
+ self.config.mm_vision_select_feature = mm_vision_select_feature
235
+
236
+ if getattr(self, "mm_projector", None) is None:
237
+
238
+ if self.config.mm_projector_type == "sva":
239
+ self.mm_projector = nn.Sequential(
240
+ nn.Linear(
241
+ vision_hidden_size * num_query_group, self.config.hidden_size
242
+ ),
243
+ nn.GELU(),
244
+ nn.Linear(self.config.hidden_size, self.config.hidden_size),
245
+ )
246
+ for aux_i, vision_tower_aux in enumerate(vision_tower_aux_list):
247
+ setattr(
248
+ self,
249
+ "mm_projector_aux_{}".format(aux_i),
250
+ nn.Sequential(
251
+ nn.Linear(vision_tower_aux.hidden_size, vision_hidden_size),
252
+ nn.GELU(),
253
+ nn.Linear(vision_hidden_size, vision_hidden_size),
254
+ nn.LayerNorm(vision_hidden_size),
255
+ ),
256
+ )
257
+
258
+ # vision sampler for each group of query as the connector before the LLM
259
+ for query_group_i in range(num_query_group):
260
+ cross_att_token_len_list = [
261
+ int(vision_tower_aux_token_len**0.5)
262
+ // int(query_num_list[query_group_i] ** 0.5)
263
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
264
+ ]
265
+ setattr(
266
+ self,
267
+ "vision_sampler_{}".format(query_group_i),
268
+ VisionTokenSampler(
269
+ vision_hidden_size,
270
+ vision_hidden_size,
271
+ [vision_hidden_size] * len(vision_tower_aux_list),
272
+ cross_att_token_len_list,
273
+ vision_hidden_size,
274
+ connector_depth,
275
+ ),
276
+ )
277
+
278
+ # sampler layers within LLM
279
+ if not connector_only:
280
+ num_of_vision_sampler_layers = (
281
+ self.config.num_of_vision_sampler_layers
282
+ ) = model_args.num_of_vision_sampler_layers
283
+ self.config.start_of_vision_sampler_layers = (
284
+ model_args.start_of_vision_sampler_layers
285
+ )
286
+ self.config.stride_of_vision_sampler_layers = (
287
+ model_args.stride_of_vision_sampler_layers
288
+ )
289
+ cross_att_token_len_list = [
290
+ int(vision_tower_aux_token_len**0.5)
291
+ // int(image_token_len**0.5)
292
+ for vision_tower_aux_token_len in vision_tower_aux_token_len_list
293
+ ]
294
+ self.vision_sampler_layers = nn.ModuleList(
295
+ [
296
+ VisionTokenSampler(
297
+ self.config.hidden_size,
298
+ vision_hidden_size,
299
+ [vision_hidden_size] * len(vision_tower_aux_list),
300
+ cross_att_token_len_list,
301
+ vision_hidden_size,
302
+ 1,
303
+ )
304
+ for layer_idx in range(0, num_of_vision_sampler_layers)
305
+ ]
306
+ )
307
+ vision_embed_std = 1 / torch.sqrt(
308
+ torch.tensor(vision_hidden_size, dtype=self.dtype)
309
+ )
310
+ self.vision_query = nn.Parameter(
311
+ torch.randn((num_query_group, vision_hidden_size), dtype=self.dtype)
312
+ * vision_embed_std
313
+ )
314
+
315
+ embed_std = 1 / torch.sqrt(
316
+ torch.tensor(self.config.hidden_size, dtype=self.dtype)
317
+ )
318
+ self.image_newline = nn.Parameter(
319
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
320
+ )
321
+
322
+ else:
323
+ self.config.mm_hidden_size = sum(
324
+ [
325
+ vision_tower_aux.hidden_size
326
+ for vision_tower_aux in vision_tower_aux_list
327
+ ]
328
+ )
329
+ self.mm_projector = build_vision_projector(self.config)
330
+ embed_std = 1 / torch.sqrt(
331
+ torch.tensor(self.config.hidden_size, dtype=self.dtype)
332
+ )
333
+ self.image_newline = nn.Parameter(
334
+ torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
335
+ )
336
+ else:
337
+ # In case it is frozen by LoRA
338
+ for p in self.mm_projector.parameters():
339
+ p.requires_grad = True
340
+
341
+ if pretrain_mm_mlp_adapter is not None:
342
+ mm_projector_weights = torch.load(
343
+ pretrain_mm_mlp_adapter, map_location="cpu"
344
+ )
345
+
346
+ def get_w(weights, keyword):
347
+ return {
348
+ k.split(keyword + ".")[1]: v
349
+ for k, v in weights.items()
350
+ if keyword + "." in k
351
+ }
352
+
353
+ self.mm_projector.load_state_dict(
354
+ get_w(mm_projector_weights, "mm_projector"), strict=True
355
+ )
356
+
357
+ if self.config.mm_projector_type == "sva":
358
+ for aux_i in range(len(vision_tower_aux_list)):
359
+ getattr(self, "mm_projector_aux_{}".format(aux_i)).load_state_dict(
360
+ get_w(
361
+ mm_projector_weights, "mm_projector_aux_{}".format(aux_i)
362
+ ),
363
+ strict=True,
364
+ )
365
+
366
+ for query_group_i in range(num_query_group):
367
+ getattr(
368
+ self, "vision_sampler_{}".format(query_group_i)
369
+ ).load_state_dict(
370
+ get_w(
371
+ mm_projector_weights,
372
+ "vision_sampler_{}".format(query_group_i),
373
+ ),
374
+ strict=True,
375
+ )
376
+
377
+ if not connector_only:
378
+ self.vision_sampler_layers.load_state_dict(
379
+ get_w(mm_projector_weights, "vision_sampler_layers"),
380
+ strict=True,
381
+ )
382
+ self.vision_query.data = mm_projector_weights["model.vision_query"]
383
+ self.image_newline.data = mm_projector_weights["model.image_newline"]
384
+
385
+
386
+ def unmask_attention_mask(mask, original_size):
387
+ original_w, original_h = original_size
388
+ cur_h, cur_w = mask.shape[1:3]
389
+
390
+ original_aspect_ratio = original_w / original_h
391
+ current_aspect_ratio = cur_w / cur_h
392
+
393
+ if original_aspect_ratio > current_aspect_ratio:
394
+ scale_factor = cur_w / original_w
395
+ new_height = int(original_h * scale_factor)
396
+ padding = (cur_h - new_height) // 2
397
+ if padding > 0:
398
+ mask[:, :padding, :] = 0
399
+ mask[:, -padding:, :] = 0
400
+ return mask
401
+ else:
402
+ scale_factor = cur_h / original_h
403
+ new_width = int(original_w * scale_factor)
404
+ padding = (cur_w - new_width) // 2
405
+ if padding > 0:
406
+ mask[:, :, :padding] = 0
407
+ mask[:, :, -padding:] = 0
408
+ return mask
409
+
410
+
411
+ def unpad_image(tensor, original_size):
412
+ """
413
+ Unpads a PyTorch tensor of a padded and resized image.
414
+
415
+ Args:
416
+ tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
417
+ original_size (tuple): The original size of the image (height, width).
418
+
419
+ Returns:
420
+ torch.Tensor: The unpadded image tensor.
421
+ """
422
+ original_width, original_height = original_size
423
+ current_height, current_width = tensor.shape[1:3]
424
+
425
+ original_aspect_ratio = original_width / original_height
426
+ current_aspect_ratio = current_width / current_height
427
+
428
+ if original_aspect_ratio > current_aspect_ratio:
429
+ scale_factor = current_width / original_width
430
+ new_height = int(original_height * scale_factor)
431
+ padding = (current_height - new_height) // 2
432
+ unpadded_tensor = tensor[:, padding : current_height - padding, :]
433
+ # if 0 in unpadded_tensor.shape:
434
+ # print(f"scale_factor: {scale_factor}, new_height: {new_height}, padding: {padding}, original_width: {original_width}, original_height: {original_height}")
435
+ else:
436
+ scale_factor = current_height / original_height
437
+ new_width = int(original_width * scale_factor)
438
+ padding = (current_width - new_width) // 2
439
+ unpadded_tensor = tensor[:, :, padding : current_width - padding]
440
+ # if 0 in unpadded_tensor.shape:
441
+ # print(f"scale_factor: {scale_factor}, new_width: {new_width}, padding: {padding}, original_width: {original_width}, original_height: {original_height}")
442
+
443
+ return unpadded_tensor
444
+
445
+
446
+ class CambrianMetaForCausalLM(ABC):
447
+
448
+ @abstractmethod
449
+ def get_model(self):
450
+ pass
451
+
452
+ # def get_vision_tower(self):
453
+ # return self.get_model().get_vision_tower()
454
+
455
+ def get_vision_tower_aux_list(self):
456
+ return self.get_model().get_vision_tower_aux_list()
457
+
458
+ def rearrange_vision_tower_features_train(
459
+ self,
460
+ vision_tower_aux_feature_list,
461
+ vision_tower_aux_attention_masks_list,
462
+ query_side_len,
463
+ ):
464
+ vision_tower_aux_feature_rearranged_list = []
465
+ vision_tower_aux_attention_masks_rearranged_list = []
466
+ bs = vision_tower_aux_feature_list[0].shape[0]
467
+ for vision_tower_aux_feature, vision_tower_aux_attention_masks in zip(
468
+ vision_tower_aux_feature_list, vision_tower_aux_attention_masks_list
469
+ ):
470
+ aux_height = aux_width = int(vision_tower_aux_feature.shape[1] ** 0.5)
471
+ assert (aux_height // query_side_len) * query_side_len == aux_height
472
+
473
+ reduce_factor = aux_height // query_side_len
474
+ vision_tower_aux_feature_rearranged = vision_tower_aux_feature.view(
475
+ bs, query_side_len, reduce_factor, query_side_len, reduce_factor, -1
476
+ )
477
+ vision_tower_aux_feature_rearranged = (
478
+ vision_tower_aux_feature_rearranged.permute(0, 1, 3, 2, 4, 5)
479
+ .contiguous()
480
+ .flatten(0, 2)
481
+ .flatten(1, 2)
482
+ )
483
+
484
+ vision_tower_aux_attention_masks_rearranged = (
485
+ vision_tower_aux_attention_masks.view(
486
+ bs * query_side_len * query_side_len, reduce_factor * reduce_factor
487
+ )
488
+ )
489
+
490
+ vision_tower_aux_feature_rearranged_list.append(
491
+ vision_tower_aux_feature_rearranged
492
+ )
493
+ vision_tower_aux_attention_masks_rearranged_list.append(
494
+ vision_tower_aux_attention_masks_rearranged
495
+ )
496
+ return (
497
+ vision_tower_aux_feature_rearranged_list,
498
+ vision_tower_aux_attention_masks_rearranged_list,
499
+ )
500
+
501
+ def rearrange_vision_tower_features_inference(
502
+ self, vision_tower_aux_feature_list, query_side_len, image_sizes, unpad=False
503
+ ):
504
+ vision_tower_aux_feature_rearranged_list = []
505
+ vision_tower_aux_attention_masks_rearranged_list = []
506
+ bs = vision_tower_aux_feature_list[0].shape[0]
507
+ for vision_tower_aux_feature in vision_tower_aux_feature_list:
508
+ aux_height = aux_width = int(vision_tower_aux_feature.shape[1] ** 0.5)
509
+ assert (aux_height // query_side_len) * query_side_len == aux_height
510
+
511
+ reduce_factor = aux_height // query_side_len
512
+
513
+ vision_tower_aux_feature_rearranged = []
514
+ vision_tower_aux_attention_masks_rearranged = []
515
+ for batch_i in range(bs):
516
+ image_size = image_sizes[batch_i]
517
+ cur_vision_tower_aux_feature = vision_tower_aux_feature[batch_i]
518
+
519
+ cur_vision_tower_aux_attention_masks_rearranged = torch.ones(
520
+ (1, aux_height, aux_width),
521
+ dtype=torch.bool,
522
+ device=cur_vision_tower_aux_feature.device,
523
+ )
524
+ cur_vision_tower_aux_feature_rearranged = (
525
+ cur_vision_tower_aux_feature.view(
526
+ 1,
527
+ query_side_len,
528
+ reduce_factor,
529
+ query_side_len,
530
+ reduce_factor,
531
+ -1,
532
+ )
533
+ )
534
+ cur_vision_tower_aux_feature_rearranged = (
535
+ cur_vision_tower_aux_feature_rearranged.permute(
536
+ 0, 1, 3, 2, 4, 5
537
+ ).contiguous()
538
+ )
539
+ if unpad:
540
+ cur_vision_tower_aux_feature_rearranged = unpad_image(
541
+ cur_vision_tower_aux_feature_rearranged, image_size
542
+ )
543
+ cur_vision_tower_aux_feature_rearranged = (
544
+ cur_vision_tower_aux_feature_rearranged.flatten(0, 2).flatten(1, 2)
545
+ ) # query_side_len*query_side_len X reduce_factor*reduce_factor X C
546
+
547
+ cur_vision_tower_aux_attention_masks_rearranged = unmask_attention_mask(
548
+ cur_vision_tower_aux_attention_masks_rearranged, image_size
549
+ )
550
+ cur_vision_tower_aux_attention_masks_rearranged = (
551
+ cur_vision_tower_aux_attention_masks_rearranged.view(
552
+ 1, query_side_len, reduce_factor, query_side_len, reduce_factor
553
+ )
554
+ .permute(0, 1, 3, 2, 4)
555
+ .contiguous()
556
+ )
557
+ if unpad:
558
+ cur_vision_tower_aux_attention_masks_rearranged = unpad_image(
559
+ cur_vision_tower_aux_attention_masks_rearranged, image_size
560
+ )
561
+ cur_vision_tower_aux_attention_masks_rearranged = (
562
+ cur_vision_tower_aux_attention_masks_rearranged.flatten(
563
+ 0, 2
564
+ ).flatten(1, 2)
565
+ )
566
+
567
+ cur_vision_tower_aux_attention_masks_rearranged[
568
+ cur_vision_tower_aux_attention_masks_rearranged.sum(-1) == 0
569
+ ] = True
570
+
571
+ vision_tower_aux_feature_rearranged.append(
572
+ cur_vision_tower_aux_feature_rearranged
573
+ )
574
+ vision_tower_aux_attention_masks_rearranged.append(
575
+ cur_vision_tower_aux_attention_masks_rearranged
576
+ )
577
+
578
+ vision_tower_aux_feature_rearranged = torch.cat(
579
+ vision_tower_aux_feature_rearranged, 0
580
+ )
581
+ vision_tower_aux_attention_masks_rearranged = torch.cat(
582
+ vision_tower_aux_attention_masks_rearranged, 0
583
+ )
584
+
585
+ vision_tower_aux_feature_rearranged_list.append(
586
+ vision_tower_aux_feature_rearranged
587
+ )
588
+ vision_tower_aux_attention_masks_rearranged_list.append(
589
+ vision_tower_aux_attention_masks_rearranged
590
+ )
591
+
592
+ return (
593
+ vision_tower_aux_feature_rearranged_list,
594
+ vision_tower_aux_attention_masks_rearranged_list,
595
+ )
596
+
597
+ def encode_images(self, image_aux_list, encode_type=None):
598
+ vision_tower_aux_list = self.get_model().get_vision_tower_aux_list()
599
+ image_aux_features_list = []
600
+ chunk_size = 64
601
+ if encode_type == "dino":
602
+ image_aux = image_aux_list[-1]
603
+ vision_tower_aux = vision_tower_aux_list[-1]
604
+ if image_aux.shape[0] > chunk_size:
605
+ image_aux_features_chunks = []
606
+ for start_idx in range(0, image_aux.shape[0], chunk_size):
607
+ end_idx = min(start_idx + chunk_size, image_aux.shape[0])
608
+ chunk = image_aux[start_idx:end_idx]
609
+ image_aux_features_chunk = vision_tower_aux(chunk)
610
+ image_aux_features_chunks.append(image_aux_features_chunk)
611
+ image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
612
+ else:
613
+ image_aux_features = vision_tower_aux(image_aux)
614
+ return image_aux_features
615
+ elif encode_type == "siglip":
616
+ image_aux = image_aux_list[0]
617
+ vision_tower_aux = vision_tower_aux_list[0]
618
+ if image_aux.shape[0] > chunk_size:
619
+ image_aux_features_chunks = []
620
+ for start_idx in range(0, image_aux.shape[0], chunk_size):
621
+ end_idx = min(start_idx + chunk_size, image_aux.shape[0])
622
+ chunk = image_aux[start_idx:end_idx]
623
+ image_aux_features_chunk = vision_tower_aux(chunk)
624
+ image_aux_features_chunks.append(image_aux_features_chunk)
625
+ image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
626
+ else:
627
+ image_aux_features = vision_tower_aux(image_aux)
628
+ return image_aux_features
629
+ else:
630
+ for image_aux, vision_tower_aux in zip(
631
+ image_aux_list, vision_tower_aux_list
632
+ ):
633
+ if image_aux.shape[0] > chunk_size:
634
+ image_aux_features_chunks = []
635
+ for start_idx in range(0, image_aux.shape[0], chunk_size):
636
+ end_idx = min(start_idx + chunk_size, image_aux.shape[0])
637
+ chunk = image_aux[start_idx:end_idx]
638
+ image_aux_features_chunk = vision_tower_aux(chunk)
639
+ image_aux_features_chunks.append(image_aux_features_chunk)
640
+ image_aux_features = torch.cat(image_aux_features_chunks, dim=0)
641
+ else:
642
+ image_aux_features = vision_tower_aux(image_aux)
643
+ image_aux_features_list.append(image_aux_features)
644
+ return image_aux_features_list
645
+
646
+ def select_frame(
647
+ self,
648
+ feature_list,
649
+ split_sizes,
650
+ input_ids,
651
+ new_image_aux_list,
652
+ image_sizes,
653
+ window_size=16,
654
+ threshold=0.83,
655
+ ):
656
+ dino_features_batch = torch.split(feature_list, split_sizes, dim=0)
657
+ new_image_aux_batch_0 = torch.split(new_image_aux_list[0], split_sizes, dim=0)
658
+ new_image_aux_batch_1 = torch.split(new_image_aux_list[1], split_sizes, dim=0)
659
+ new_split_sizes = []
660
+ selected_frames_all_0 = []
661
+ selected_frames_all_1 = []
662
+ selected_frames_feature_all = []
663
+ selected_frame_indices_all = []
664
+ for i_batch, frame_features in enumerate(dino_features_batch):
665
+ try:
666
+ if "llama" in self.get_model().config.model_type:
667
+ text_len = torch.where(input_ids[i_batch] == 128002)[-1][0]
668
+ else:
669
+ text_len = torch.where(input_ids[i_batch] == 151643)[-1][0]
670
+ except:
671
+ text_len = len(input_ids[i_batch])
672
+ original_width, original_height = image_sizes[i_batch]
673
+ if getattr(self.get_model().config, "highres", False):
674
+ token_per_frame = self.get_model().config.lowres_token ** 2
675
+ else:
676
+ token_per_frame = self.get_model().config.image_token_len
677
+ # current_height, current_width = token_per_side, token_per_side
678
+ # original_aspect_ratio = original_width / original_height
679
+ # current_aspect_ratio = current_width / current_height
680
+ # if original_aspect_ratio > current_aspect_ratio:
681
+ # scale_factor = current_width / original_width
682
+ # new_height = int(original_height * scale_factor)
683
+ # padding = math.ceil((current_height - new_height) / 2.0)
684
+ # token_per_frame = (
685
+ # current_height - padding * 2
686
+ # ) * token_per_side + token_per_side
687
+ # else:
688
+ # scale_factor = current_height / original_height
689
+ # new_width = int(original_width * scale_factor)
690
+ # padding = math.ceil((current_width - new_width) / 2.0)
691
+ # token_per_frame = (current_width - padding * 2) * token_per_side + (
692
+ # current_width - padding * 2
693
+ # )
694
+ # token_per_frame = (
695
+ # token_per_side**2 if token_per_frame < 1 else token_per_frame
696
+ # )
697
+ max_num_frames = max(
698
+ 1,
699
+ (
700
+ self.get_model().config.tokenizer_model_max_length
701
+ - text_len
702
+ - getattr(self.get_model().config, "inference_max_length", 16)
703
+ )
704
+ // token_per_frame,
705
+ )
706
+ if len(frame_features) < max_num_frames:
707
+ selected_frames_all_0.append(new_image_aux_batch_0[i_batch])
708
+ selected_frames_all_1.append(new_image_aux_batch_1[i_batch])
709
+ selected_frames_feature_all.append(frame_features)
710
+ new_split_sizes.append(len(frame_features))
711
+ selected_frame_indices_all.append(torch.arange(len(frame_features)))
712
+ continue
713
+
714
+ num_segments = len(frame_features) // window_size
715
+ if num_segments == 0:
716
+ query_feature = frame_features.flatten(1, 2)
717
+ query_feature = query_feature / torch.norm(
718
+ (query_feature), dim=1, keepdim=True
719
+ )
720
+ similarities = torch.mean(query_feature @ query_feature.T, dim=1)
721
+ similarities[len(frame_features) // 2] = 0
722
+ indices = torch.where(similarities < threshold)[0]
723
+ selected_frame_indices_all.append(indices)
724
+ selected_frames_all_0.append(new_image_aux_batch_0[i_batch][indices])
725
+ selected_frames_all_1.append(new_image_aux_batch_1[i_batch][indices])
726
+ selected_frames_feature_all.append(frame_features[indices])
727
+ new_split_sizes.append(len(indices))
728
+ continue
729
+ segments_frames_0 = []
730
+ segments_frames_1 = []
731
+ segments_features = []
732
+ for start_idx in range(0, len(frame_features), window_size):
733
+ end_idx = min(start_idx + window_size, len(frame_features))
734
+ segments_frames_0.append(
735
+ new_image_aux_batch_0[i_batch][start_idx:end_idx]
736
+ )
737
+ segments_frames_1.append(
738
+ new_image_aux_batch_1[i_batch][start_idx:end_idx]
739
+ )
740
+ segments_features.append(frame_features[start_idx:end_idx])
741
+ selected_frames_0 = []
742
+ selected_frames_1 = []
743
+ selected_features = []
744
+ selected_frame_indices = []
745
+ for i, segment in enumerate(segments_features):
746
+ query_feature = segment.flatten(1, 2)
747
+ query_feature = query_feature / torch.norm(
748
+ (query_feature), dim=1, keepdim=True
749
+ )
750
+ similarities = torch.mean(query_feature @ query_feature.T, dim=1)
751
+ similarities[len(segment) // 2] = 0
752
+ indices = torch.where(similarities < threshold)[0]
753
+ selected_frames_0.append(segments_frames_0[i][indices])
754
+ selected_frames_1.append(segments_frames_1[i][indices])
755
+ selected_features.append(segment[indices])
756
+ selected_frame_indices.extend(indices + i * window_size)
757
+ selected_frames_0 = torch.cat(selected_frames_0, dim=0)
758
+ selected_frames_1 = torch.cat(selected_frames_1, dim=0)
759
+ selected_features = torch.cat(selected_features, dim=0)
760
+ selected_frame_indices = torch.tensor(selected_frame_indices)
761
+ # ablation
762
+ max_num_frames = 400 # hard cap on retained frames to avoid running out of memory
763
+ if len(selected_frames_0) > max_num_frames:
764
+ interval = len(selected_frames_0) / float(max_num_frames)
765
+ indices = [int(interval * i) for i in range(max_num_frames)]
766
+ new_split_sizes.append(len(indices))
767
+ selected_frames_all_0.append(selected_frames_0[indices])
768
+ selected_frames_all_1.append(selected_frames_1[indices])
769
+ selected_frames_feature_all.append(selected_features[indices])
770
+ selected_frame_indices = selected_frame_indices[indices]
771
+ else:
772
+ new_split_sizes.append(len(selected_frames_0))
773
+ selected_frames_all_0.append(selected_frames_0)
774
+ selected_frames_all_1.append(selected_frames_1)
775
+ selected_frames_feature_all.append(selected_features)
776
+ selected_frame_indices_all.append(selected_frame_indices)
777
+ selected_frames_all_0 = torch.cat(selected_frames_all_0, dim=0)
778
+ selected_frames_all_1 = torch.cat(selected_frames_all_1, dim=0)
779
+ selected_frames_feature_all = torch.cat(selected_frames_feature_all, dim=0)
780
+ return (
781
+ selected_frames_feature_all,
782
+ new_split_sizes,
783
+ [selected_frames_all_0, selected_frames_all_1],
784
+ selected_frame_indices_all,
785
+ )
786
+
787
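+ # Convert raw image/video tensors into visual token embeddings, splice them into the text
+ # embeddings at each image placeholder, and rebuild labels, attention_mask and position_ids
+ # for the resulting multimodal sequence.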
+ def prepare_inputs_labels_for_multimodal(
788
+ self,
789
+ input_ids,
790
+ position_ids,
791
+ attention_mask,
792
+ past_key_values,
793
+ labels,
794
+ images,
795
+ image_aux_attention_masks_list=None,
796
+ image_sizes=None,
797
+ ):
798
+ # vision_tower = self.get_vision_tower()
799
+ vision_tower_aux_list = self.get_model().get_vision_tower_aux_list()
800
+ if vision_tower_aux_list is None or images is None or input_ids.shape[1] == 1:
801
+ return (
802
+ input_ids,
803
+ position_ids,
804
+ attention_mask,
805
+ past_key_values,
806
+ None,
807
+ labels,
808
+ None,
809
+ None,
810
+ None,
811
+ None,
812
+ )
813
+
814
+ image_aux_list = images
815
+
816
+ split_sizes = None
817
+
818
+ if type(image_aux_list[0]) is list or image_aux_list[0].ndim == 5:
819
+ split_sizes_ori = [
820
+ 1 if image.ndim == 3 else image.shape[0] for image in image_aux_list[0]
821
+ ]
822
+ new_image_aux_list = []
823
+ for image_aux in image_aux_list:
824
+ if type(image_aux) is list:
825
+ image_aux = [
826
+ x.unsqueeze(0) if x.ndim == 3 else x for x in image_aux
827
+ ]
828
+ concat_image_aux = torch.cat([image for image in image_aux], dim=0)
829
+ new_image_aux_list.append(concat_image_aux)
830
+ image_aux_features_dino = self.encode_images(
831
+ new_image_aux_list, encode_type="dino"
832
+ )
833
+
834
+ (
835
+ image_aux_features_dino,
836
+ split_sizes,
837
+ new_image_aux_list,
838
+ selected_frame_indices_all,
839
+ ) = self.select_frame(
840
+ image_aux_features_dino,
841
+ split_sizes_ori,
842
+ input_ids,
843
+ new_image_aux_list,
844
+ image_sizes,
845
+ threshold=getattr(self.get_model().config, "dino_threshold", 0.83),
846
+ )
847
+
848
+ image_aux_features_siglip = self.encode_images(
849
+ new_image_aux_list, encode_type="siglip"
850
+ )
851
+ image_aux_features_list = [
852
+ image_aux_features_siglip,
853
+ image_aux_features_dino,
854
+ ]
855
+
856
+ bs = image_aux_features_list[0].shape[0]
857
+ dtype = new_image_aux_list[0].dtype
858
+
859
+ frame_sizes = []
860
+ for i in range(len(image_sizes)):
861
+ for j in range(split_sizes[i]):
862
+ frame_sizes.append(image_sizes[i])
863
+ image_sizes = frame_sizes
864
+ else:
865
+ image_aux_features_list = self.encode_images(image_aux_list)
866
+ bs = image_aux_list[0].shape[0]
867
+ dtype = image_aux_list[0].dtype
868
+
869
+ image_token_len = self.get_model().config.image_token_len
870
+ query_num_list = self.get_model().config.query_num_list
871
+
872
+ final_height = final_width = int(image_token_len**0.5)
873
+
874
+ final_image_features_list = []
875
+ final_image_features_down_list = []
876
+
877
+ # only needed for sva
878
+ vision_tower_aux_feature_list_final = None
879
+ vision_tower_aux_attention_masks_list_final = None
880
+ global_context_feature_final = None
881
+
882
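+ # "sva" projector path: project the features of every auxiliary vision tower, then let
+ # learnable query tokens cross-attend to them (one vision sampler per query group).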
+ if self.get_model().config.mm_projector_type == "sva":
883
+ vision_tower_aux_feature_list = []
884
+ vision_tower_aux_attention_masks_list = []
885
+ # get vision tokens from each vision tower
886
+ for aux_i in range(len(vision_tower_aux_list)):
887
+ image_aux_features = image_aux_features_list[aux_i]
888
+
889
+ image_aux_features = getattr(
890
+ self.get_model(), "mm_projector_aux_{}".format(aux_i)
891
+ )(image_aux_features).to(dtype)
892
+ if aux_i == 0:
893
+ global_context_feature = image_aux_features.mean(1).view(
894
+ bs, 1, 1, -1
895
+ )
896
+
897
+ vision_tower_aux_feature_list.append(image_aux_features)
898
+ input_mix_res = True
899
+ input_high_res = True
900
+ # perform vision sampling for each query group
901
+ for query_group_i, query_num in enumerate(query_num_list):
902
+ query_features_i = (
903
+ self.get_model()
904
+ .vision_query[query_group_i, :]
905
+ .view(1, 1, 1, -1)
906
+ .expand(bs, query_num, -1, -1)
907
+ )
908
+ global_context_feature_i = global_context_feature.expand(
909
+ -1, query_num, 1, -1
910
+ ).flatten(0, 1)
911
+ query_side_len = int(query_num**0.5)
912
+ if IS_XLA_AVAILABLE:
913
+ (
914
+ vision_tower_aux_feature_list_i,
915
+ vision_tower_aux_attention_masks_list_i,
916
+ ) = self.rearrange_vision_tower_features_train(
917
+ vision_tower_aux_feature_list,
918
+ image_aux_attention_masks_list,
919
+ query_side_len,
920
+ )
921
+ else:
922
+ (
923
+ vision_tower_aux_feature_list_i,
924
+ vision_tower_aux_attention_masks_list_i,
925
+ ) = self.rearrange_vision_tower_features_inference(
926
+ vision_tower_aux_feature_list, query_side_len, image_sizes
927
+ )
928
+
929
+ query_features_i = getattr(
930
+ self.get_model(), "vision_sampler_{}".format(query_group_i)
931
+ )(
932
+ query_features_i.flatten(0, 1),
933
+ global_context_feature_i,
934
+ *vision_tower_aux_feature_list_i,
935
+ *vision_tower_aux_attention_masks_list_i,
936
+ )
937
+ query_features_i = query_features_i.view(bs, query_num, -1)
938
+
939
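+ # Choose a resolution mode from the token budget: if every frame fits at full resolution,
+ # skip mixed resolution; if even the low-resolution tokens do not fit, fall back to the
+ # coarse 8x8 interpolation below.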
+ if split_sizes is not None:
940
+ try:
941
+ if "llama" in self.get_model().config.model_type:
942
+ text_len = torch.where(input_ids[0] == 128002)[-1][0]
943
+ else:
944
+ text_len = torch.where(input_ids[0] == 151643)[-1][0]
945
+ except Exception:
946
+ text_len = len(input_ids[0])
947
+ max_visual_len = (
948
+ self.get_model().config.tokenizer_model_max_length
949
+ - text_len
950
+ - getattr(self.get_model().config, "inference_max_length", 16)
951
+ )
952
+ max_num_frames = max(
953
+ 1,
954
+ math.floor(max_visual_len // (final_height * final_width)),
955
+ )
956
+ max_num_frames_low = max(
957
+ 1,
958
+ math.floor(
959
+ max_visual_len
960
+ // (self.get_model().config.lowres_token ** 2)
961
+ ),
962
+ )
963
+ if split_sizes[0] < max_num_frames:
964
+ input_mix_res = False
965
+ elif split_sizes[0] > max_num_frames_low:
966
+ input_mix_res = False
967
+ input_high_res = False
968
+
969
+ # input_mix_res = False # ablation
970
+
971
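+ # In mixed-resolution mode, also keep a low-resolution copy of the query features
+ # (lowres_token x lowres_token) so that individual frames can later be swapped between
+ # low- and high-resolution token counts.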
+ if (getattr(self.config, "highres", False)) and input_mix_res:
972
+ _query_features_i = (
973
+ query_features_i.permute(0, 2, 1)
974
+ .contiguous()
975
+ .view(bs, -1, query_side_len, query_side_len)
976
+ )
977
+ _query_features_i = F.interpolate(
978
+ _query_features_i.float(),
979
+ size=(
980
+ self.get_model().config.lowres_token,
981
+ self.get_model().config.lowres_token,
982
+ ),
983
+ mode="bilinear",
984
+ align_corners=False,
985
+ ).to(dtype=query_features_i.dtype)
986
+ _query_features_i = (
987
+ _query_features_i.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
988
+ )
989
+ final_image_features_down_list.append(_query_features_i)
990
+
991
+ # interpolate to the final target size
992
+ if query_side_len != final_height:
993
+ query_features_i = (
994
+ query_features_i.permute(0, 2, 1)
995
+ .contiguous()
996
+ .view(bs, -1, query_side_len, query_side_len)
997
+ )
998
+ if input_high_res:
999
+ query_features_i = F.interpolate(
1000
+ query_features_i.float(),
1001
+ size=(final_height, final_width),
1002
+ mode="bilinear",
1003
+ align_corners=False,
1004
+ ).to(dtype=query_features_i.dtype)
1005
+ else:
1006
+ query_features_i = F.interpolate(
1007
+ query_features_i.float(),
1008
+ size=(8, 8),
1009
+ mode="bilinear",
1010
+ align_corners=False,
1011
+ ).to(dtype=query_features_i.dtype)
1012
+ query_features_i = (
1013
+ query_features_i.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
1014
+ )
1015
+ final_image_features_list.append(query_features_i)
1016
+
1017
+ if IS_XLA_AVAILABLE:
1018
+ (
1019
+ vision_tower_aux_feature_list_final,
1020
+ vision_tower_aux_attention_masks_list_final,
1021
+ ) = self.rearrange_vision_tower_features_train(
1022
+ vision_tower_aux_feature_list,
1023
+ image_aux_attention_masks_list,
1024
+ final_height,
1025
+ )
1026
+ global_context_feature_final = global_context_feature.expand(
1027
+ -1, final_height * final_width, 1, -1
1028
+ ).flatten(0, 1)
1029
+ else:
1030
+ final_image_features_list = image_aux_features_list
1031
+
1032
+ image_features = torch.cat(final_image_features_list, -1)
1033
+ image_features = self.get_model().mm_projector(image_features).to(dtype)
1034
+
1035
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1036
+ image_features_down = torch.cat(final_image_features_down_list, -1)
1037
+ image_features_down = (
1038
+ self.get_model().mm_projector(image_features_down).to(dtype)
1039
+ )
1040
+
1041
+ if IS_XLA_AVAILABLE:
1042
+ image_features = image_features.view(
1043
+ image_features.shape[0], final_height, final_width, -1
1044
+ )
1045
+ image_features = torch.cat(
1046
+ (
1047
+ image_features,
1048
+ self.model.image_newline[None, None, None, :].expand(
1049
+ image_features.shape[0], final_height, 1, -1
1050
+ ),
1051
+ ),
1052
+ dim=2,
1053
+ )
1054
+ image_features = image_features.flatten(1, 2)
1055
+ final_size = [(final_height, final_width)] * bs
1056
+
1057
+ else:
1058
+ image_features = image_features.view(bs, final_height, final_width, -1)
1059
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1060
+ image_features_down = image_features_down.view(
1061
+ bs,
1062
+ self.get_model().config.lowres_token,
1063
+ self.get_model().config.lowres_token,
1064
+ -1,
1065
+ )
1066
+ image_features_unpadded = []
1067
+ image_features_downsample = []
1068
+ final_size = []
1069
+ if self.get_model().config.mm_projector_type == "sva":
1070
+ (
1071
+ vision_tower_aux_feature_list_final,
1072
+ vision_tower_aux_attention_masks_list_final,
1073
+ ) = self.rearrange_vision_tower_features_inference(
1074
+ vision_tower_aux_feature_list, final_height, image_sizes, unpad=True
1075
+ )
1076
+ global_context_feature_final = []
1077
+ for batch_i in range(bs):
1078
+ cur_image_feature = image_features[batch_i]
1079
+ image_size = image_sizes[batch_i]
1080
+
1081
+ cur_image_feature = unpad_image(
1082
+ cur_image_feature.unsqueeze(0), image_size
1083
+ )
1084
+
1085
+ cur_h, cur_w = cur_image_feature.shape[1:3]
1086
+ try: # fix bug for some invalid image
1087
+ cur_image_feature = cur_image_feature.view(1, cur_h, cur_w, -1)
1088
+ final_size.append((cur_h, cur_w))
1089
+ except Exception:  # unpadding produced an invalid shape; fall back to the padded features
1090
+ # print(f"invalid after unpad {image_features[batch_i].shape}, {image_sizes[batch_i]}", flush=True)
1091
+ cur_image_feature = image_features[batch_i].unsqueeze(0)
1092
+ image_size = image_sizes[batch_i]
1093
+ cur_h, cur_w = cur_image_feature.shape[1:3]
1094
+ cur_image_feature = cur_image_feature.view(1, cur_h, cur_w, -1)
1095
+ final_size.append((cur_h, cur_w))
1096
+
1097
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1098
+ cur_image_feature_down = unpad_image(
1099
+ image_features_down[batch_i].unsqueeze(0),
1100
+ (
1101
+ int(
1102
+ image_size[0]
1103
+ / (
1104
+ image_token_len**0.5
1105
+ / self.get_model().config.lowres_token
1106
+ )
1107
+ ),
1108
+ int(
1109
+ image_size[1]
1110
+ / (
1111
+ image_token_len**0.5
1112
+ / self.get_model().config.lowres_token
1113
+ )
1114
+ ),
1115
+ ),
1116
+ )
1117
+ _cur_h, _cur_w = cur_image_feature_down.shape[1:3]
1118
+
1119
+ try: # fix bug for some invalid image
1120
+ cur_image_feature_down = cur_image_feature_down.view(
1121
+ 1, _cur_h, _cur_w, -1
1122
+ )
1123
+ except Exception:
1124
+ print("invalid after unpad", flush=True)
1125
+ cur_image_feature_down = image_features_down[batch_i].unsqueeze(
1126
+ 0
1127
+ )
1128
+ _cur_h, _cur_w = cur_image_feature_down.shape[1:3]
1129
+ cur_image_feature_down = cur_image_feature_down.view(
1130
+ 1, _cur_h, _cur_w, -1
1131
+ )
1132
+
1133
+ cur_image_feature_down = torch.cat(
1134
+ (
1135
+ cur_image_feature_down,
1136
+ self.model.image_newline.view(1, 1, 1, -1)
1137
+ .expand(1, _cur_h, 1, -1)
1138
+ .to(cur_image_feature_down.device),
1139
+ ),
1140
+ dim=2,
1141
+ ).flatten(1, 2)
1142
+
1143
+ if split_sizes is None and getattr(self.config, "frame_pos", False):
1144
+ frame_pos = (
1145
+ self.get_model()
1146
+ .get_frame_pos(torch.arange(1))
1147
+ .to(cur_image_feature_down.device)
1148
+ .to(cur_image_feature_down.dtype)
1149
+ )
1150
+ cur_image_feature_down += frame_pos
1151
+
1152
+ image_features_downsample.append(cur_image_feature_down.squeeze(0))
1153
+
1154
+ cur_image_feature = torch.cat(
1155
+ (
1156
+ cur_image_feature,
1157
+ self.model.image_newline.view(1, 1, 1, -1)
1158
+ .expand(1, cur_h, 1, -1)
1159
+ .to(cur_image_feature.device),
1160
+ ),
1161
+ dim=2,
1162
+ )
1163
+
1164
+ if split_sizes is None and getattr(self.config, "frame_pos", False):
1165
+ frame_pos = (
1166
+ self.get_model()
1167
+ .get_frame_pos(torch.arange(1))
1168
+ .to(cur_image_feature.device)
1169
+ .to(cur_image_feature.dtype)
1170
+ )
1171
+ cur_image_feature += frame_pos
1172
+
1173
+ cur_image_feature = cur_image_feature.flatten(1, 2)
1174
+ image_features_unpadded.append(cur_image_feature.squeeze(0))
1175
+
1176
+ if self.get_model().config.mm_projector_type == "sva":
1177
+ cur_global_context_feature = global_context_feature[batch_i].expand(
1178
+ cur_h * cur_w, 1, -1
1179
+ )
1180
+ global_context_feature_final.append(cur_global_context_feature)
1181
+ if self.get_model().config.mm_projector_type == "sva":
1182
+ global_context_feature_final = torch.cat(
1183
+ global_context_feature_final, 0
1184
+ )
1185
+
1186
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1187
+ image_features = image_features_downsample
1188
+ else:
1189
+ image_features = image_features_unpadded
1190
+
1191
+ # TODO: image start / end is not implemented here to support pretraining.
1192
+ if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
1193
+ self.config, "mm_use_im_start_end", False
1194
+ ):
1195
+ raise NotImplementedError
1196
+
1197
+ split_image_features_unpadded = None
1198
+ frame_split_sizes = None
1199
+
1200
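+ # For video inputs, regroup the per-frame features into one feature sequence per sample
+ # and, when enabled, add frame-index positional embeddings (frame_pos).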
+ if split_sizes is not None:
1201
+ split_image_features = []
1202
+ split_image_features_unpadded = (
1203
+ []
1204
+ if (getattr(self.config, "highres", False)) and input_mix_res
1205
+ else None
1206
+ )
1207
+ start_idx = 0
1208
+ for split_batch_idx, split_size in enumerate(split_sizes):
1209
+ if isinstance(image_features[start_idx : start_idx + split_size], list):
1210
+ if getattr(self.config, "frame_pos", False):
1211
+ frame_feature = torch.cat(
1212
+ image_features[start_idx : start_idx + split_size], dim=0
1213
+ ).reshape(split_size, -1, image_features[0].shape[-1])
1214
+ frame_pos = (
1215
+ self.get_model()
1216
+ .get_frame_pos(selected_frame_indices_all[split_batch_idx])
1217
+ .to(frame_feature.device)
1218
+ .to(frame_feature.dtype)
1219
+ )
1220
+ frame_feature += frame_pos
1221
+ split_image_features.append(
1222
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1223
+ )
1224
+ else:
1225
+ split_image_features.append(
1226
+ torch.cat(
1227
+ image_features[start_idx : start_idx + split_size],
1228
+ dim=0,
1229
+ )
1230
+ )
1231
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1232
+ if getattr(self.config, "frame_pos", False):
1233
+ frame_feature = torch.cat(
1234
+ image_features_unpadded[
1235
+ start_idx : start_idx + split_size
1236
+ ],
1237
+ dim=0,
1238
+ ).reshape(split_size, -1, image_features[0].shape[-1])
1239
+ frame_pos = (
1240
+ self.get_model()
1241
+ .get_frame_pos(
1242
+ selected_frame_indices_all[split_batch_idx]
1243
+ )
1244
+ .to(frame_feature.device)
1245
+ .to(frame_feature.dtype)
1246
+ )
1247
+ frame_feature += frame_pos
1248
+ split_image_features_unpadded.append(
1249
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1250
+ )
1251
+ else:
1252
+ split_image_features_unpadded.append(
1253
+ torch.cat(
1254
+ image_features_unpadded[
1255
+ start_idx : start_idx + split_size
1256
+ ],
1257
+ dim=0,
1258
+ )
1259
+ )
1260
+ else:
1261
+ if getattr(self.config, "frame_pos", False):
1262
+ frame_feature = image_features[
1263
+ start_idx : start_idx + split_size
1264
+ ].reshape(split_size, -1, image_features[0].shape[-1])
1265
+ frame_pos = (
1266
+ self.get_model()
1267
+ .get_frame_pos(selected_frame_indices_all[split_batch_idx])
1268
+ .to(frame_feature.device)
1269
+ .to(frame_feature.dtype)
1270
+ )
1271
+ frame_feature += frame_pos
1272
+ split_image_features.append(
1273
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1274
+ )
1275
+ else:
1276
+ split_image_features.append(
1277
+ image_features[start_idx : start_idx + split_size]
1278
+ )
1279
+ if (getattr(self.config, "highres", False)) and input_mix_res:
1280
+ if getattr(self.config, "frame_pos", False):
1281
+ frame_feature = image_features_unpadded[
1282
+ start_idx : start_idx + split_size
1283
+ ]
1284
+ frame_pos = (
1285
+ self.get_model()
1286
+ .get_frame_pos(
1287
+ selected_frame_indices_all[split_batch_idx]
1288
+ )
1289
+ .to(frame_feature.device)
1290
+ .to(frame_feature.dtype)
1291
+ )
1292
+ frame_feature += frame_pos
1293
+ split_image_features_unpadded.append(
1294
+ frame_feature.reshape(-1, image_features[0].shape[-1])
1295
+ )
1296
+ else:
1297
+ split_image_features_unpadded.append(
1298
+ image_features_unpadded[
1299
+ start_idx : start_idx + split_size
1300
+ ]
1301
+ )
1302
+ start_idx += split_size
1303
+ image_features = split_image_features
1304
+ frame_split_sizes = split_sizes
1305
+
1306
+ _labels = labels
1307
+ _position_ids = position_ids
1308
+ _attention_mask = attention_mask
1309
+ if attention_mask is None:
1310
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
1311
+ else:
1312
+ attention_mask = attention_mask.bool()
1313
+ if position_ids is None:
1314
+ position_ids = torch.arange(
1315
+ 0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
1316
+ )
1317
+ if labels is None:
1318
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
1319
+
1320
+ # remove the padding using attention_mask -- FIXME
1321
+ _input_ids = input_ids
1322
+
1323
+ attention_mask = attention_mask | (input_ids == IMAGE_TOKEN_INDEX)
1324
+
1325
+ input_ids = [
1326
+ cur_input_ids[cur_attention_mask]
1327
+ for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
1328
+ ]
1329
+ labels = [
1330
+ cur_labels[cur_attention_mask]
1331
+ for cur_labels, cur_attention_mask in zip(labels, attention_mask)
1332
+ ]
1333
+
1334
+ new_input_embeds = []
1335
+ new_labels = []
1336
+ image_token_indices_batch = []
1337
+ cur_image_idx = 0
1338
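+ # Splice the visual features into the text embeddings at every IMAGE_TOKEN_INDEX
+ # placeholder and mask the visual positions out of the labels with IGNORE_INDEX.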
+ for batch_idx, cur_input_ids in enumerate(input_ids):
1339
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
1340
+ if num_images == 0:
1341
+ cur_image_features = image_features[cur_image_idx]
1342
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
1343
+ cur_input_embeds = torch.cat(
1344
+ [cur_input_embeds_1, cur_image_features[0:0]], dim=0
1345
+ )
1346
+ new_input_embeds.append(cur_input_embeds)
1347
+ new_labels.append(labels[batch_idx])
1348
+ cur_image_idx += 1
1349
+ continue
1350
+
1351
+ image_token_indices = (
1352
+ [-1]
1353
+ + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
1354
+ + [cur_input_ids.shape[0]]
1355
+ )
1356
+ image_token_indices_batch.append(
1357
+ torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()[0]
1358
+ )
1359
+ cur_input_ids_noim = []
1360
+ cur_labels = labels[batch_idx]
1361
+ cur_labels_noim = []
1362
+ for i in range(len(image_token_indices) - 1):
1363
+ cur_input_ids_noim.append(
1364
+ cur_input_ids[
1365
+ image_token_indices[i] + 1 : image_token_indices[i + 1]
1366
+ ]
1367
+ )
1368
+ cur_labels_noim.append(
1369
+ cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]
1370
+ )
1371
+ split_sizes = [x.shape[0] for x in cur_labels_noim]
1372
+ cur_input_embeds = self.get_model().embed_tokens(
1373
+ torch.cat(cur_input_ids_noim)
1374
+ )
1375
+ cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
1376
+ cur_new_input_embeds = []
1377
+ cur_new_labels = []
1378
+
1379
+ text_len = sum([x.shape[0] for x in cur_input_embeds_no_im])
1380
+ visual_len = len(image_features[cur_image_idx])
1381
+ max_visual_len = (
1382
+ self.get_model().config.tokenizer_model_max_length
1383
+ - getattr(self.get_model().config, "inference_max_length", 16)
1384
+ - text_len
1385
+ )
1386
+ mix_token = False
1387
+
1388
+ # ablation mix
1389
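+ # If there is spare token budget, upgrade the frames most similar to the trailing text
+ # embeddings to their high-resolution (unpadded) features while keeping the rest low-res.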
+ if (
1390
+ input_mix_res
1391
+ and (
1392
+ self.get_model().config.image_token_len
1393
+ > getattr(self.get_model().config, "lowres_token", 8) ** 2
1394
+ )
1395
+ and frame_split_sizes is not None
1396
+ and getattr(self.config, "highres", False)
1397
+ ):
1398
+ if max_visual_len > visual_len:
1399
+ visual_emb = image_features[cur_image_idx]
1400
+ text_emb = cur_input_embeds_no_im[-1]
1401
+ highres_num = math.floor(
1402
+ (max_visual_len - visual_len)
1403
+ / (
1404
+ split_image_features_unpadded[cur_image_idx].shape[0]
1405
+ // frame_split_sizes[cur_image_idx]
1406
+ - visual_emb.shape[0] // frame_split_sizes[cur_image_idx]
1407
+ )
1408
+ )
1409
+ if highres_num >= 1:
1410
+ mix_token = True
1411
+ sim = torch.matmul(visual_emb, text_emb.transpose(0, 1)).mean(
1412
+ dim=-1
1413
+ )
1414
+ sim_frame = sim.reshape(
1415
+ frame_split_sizes[cur_image_idx], -1
1416
+ ).mean(dim=-1)
1417
+ highres_num = min(highres_num, sim_frame.shape[0])
1418
+ top_values, top_indices = torch.topk(sim_frame, highres_num)
1419
+ if len(top_indices) > 0:
1420
+ sorted_indices = torch.sort(top_indices)[1]
1421
+ top_indices = top_indices[sorted_indices]
1422
+ visual_emb_frame = image_features[cur_image_idx].reshape(
1423
+ frame_split_sizes[cur_image_idx],
1424
+ -1,
1425
+ image_features[cur_image_idx].shape[-1],
1426
+ )
1427
+ visual_emb_frame_highres = split_image_features_unpadded[
1428
+ cur_image_idx
1429
+ ].reshape(
1430
+ frame_split_sizes[cur_image_idx],
1431
+ -1,
1432
+ split_image_features_unpadded[cur_image_idx].shape[-1],
1433
+ )
1434
+ current_point = 0
1435
+ mix_visual_emb_frame = []
1436
+ for frame_i in range(len(visual_emb_frame)):
1437
+ if current_point > len(top_indices) - 1:
1438
+ mix_visual_emb_frame.append(
1439
+ visual_emb_frame[frame_i]
1440
+ )
1441
+ continue
1442
+ if frame_i == top_indices[current_point]:
1443
+ mix_visual_emb_frame.append(
1444
+ visual_emb_frame_highres[frame_i]
1445
+ )
1446
+ current_point += 1
1447
+ else:
1448
+ mix_visual_emb_frame.append(
1449
+ visual_emb_frame[frame_i]
1450
+ )
1451
+ image_features[cur_image_idx] = torch.cat(
1452
+ mix_visual_emb_frame, dim=0
1453
+ )
1454
+ # ablation drop
1455
+
1456
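+ # If the visual tokens still exceed the budget, prune spatial tokens: within each chunk of
+ # 8 frames, keep frame 0 in full and keep only tokens of later frames whose cosine
+ # similarity to frame 0 is below drop_threshold, then truncate evenly if still too long.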
+ if (
1457
+ max_visual_len < visual_len
1458
+ and frame_split_sizes is not None
1459
+ and not mix_token
1460
+ ):
1461
+ visual_emb_frame = image_features[cur_image_idx].reshape(
1462
+ frame_split_sizes[cur_image_idx],
1463
+ -1,
1464
+ image_features[cur_image_idx].shape[-1],
1465
+ )
1466
+
1467
+ sim = F.cosine_similarity(
1468
+ visual_emb_frame[:-1],
1469
+ visual_emb_frame[1:],
1470
+ dim=-1,
1471
+ )
1472
+
1473
+ new_visual_emb_frames = []
1474
+ for start_idx in range(0, len(visual_emb_frame), 8):
1475
+ end_idx = min(start_idx + 8, len(visual_emb_frame))
1476
+ chunk_feature = visual_emb_frame[start_idx:end_idx] # 8, HW, C
1477
+ if len(chunk_feature) == 1:
1478
+ new_visual_emb_frames.append(chunk_feature[0])
1479
+ continue
1480
+ sim = F.cosine_similarity(
1481
+ chunk_feature[0]
1482
+ .unsqueeze(0)
1483
+ .repeat_interleave(len(chunk_feature[1:]), dim=0),
1484
+ chunk_feature[1:],
1485
+ dim=-1,
1486
+ )
1487
+ new_visual_emb_frame = torch.cat(
1488
+ [
1489
+ chunk_feature[0],
1490
+ chunk_feature[1:].flatten(0, 1)[
1491
+ sim.flatten(0, 1)
1492
+ < getattr(
1493
+ self.get_model().config, "drop_threshold", 0.7
1494
+ )
1495
+ ],
1496
+ ],
1497
+ dim=0,
1498
+ )
1499
+ new_visual_emb_frames.append(new_visual_emb_frame)
1500
+
1501
+ reduced_visual_len = sum([x.shape[0] for x in new_visual_emb_frames])
1502
+
1503
+ if reduced_visual_len > max_visual_len:
1504
+ force_remove = math.ceil(
1505
+ (reduced_visual_len - max_visual_len)
1506
+ / len(new_visual_emb_frames)
1507
+ )
1508
+ for chunk_i in range(len(new_visual_emb_frames)):
1509
+ new_visual_emb_frames[chunk_i] = new_visual_emb_frames[chunk_i][
1510
+ :-force_remove
1511
+ ]
1512
+ new_visual_emb_frames = torch.cat(new_visual_emb_frames, dim=0)
1513
+ else:
1514
+ new_visual_emb_frames = torch.cat(new_visual_emb_frames, dim=0)
1515
+
1516
+ image_features[cur_image_idx] = new_visual_emb_frames[:max_visual_len]
1517
+
1518
+ for i in range(num_images + 1):
1519
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
1520
+ cur_new_labels.append(cur_labels_noim[i])
1521
+ if i < num_images:
1522
+ cur_image_features = image_features[cur_image_idx]
1523
+ cur_image_idx += 1
1524
+ cur_new_input_embeds.append(cur_image_features)
1525
+ cur_new_labels.append(
1526
+ torch.full(
1527
+ (cur_image_features.shape[0],),
1528
+ IGNORE_INDEX,
1529
+ device=cur_labels.device,
1530
+ dtype=cur_labels.dtype,
1531
+ )
1532
+ )
1533
+
1534
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
1535
+
1536
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
1537
+ cur_new_labels = torch.cat(cur_new_labels)
1538
+
1539
+ new_input_embeds.append(cur_new_input_embeds)
1540
+ new_labels.append(cur_new_labels)
1541
+
1542
+ # Truncate sequences to max length as image embeddings can make the sequence longer
1543
+ tokenizer_model_max_length = getattr(
1544
+ self.config, "tokenizer_model_max_length", None
1545
+ )
1546
+ if tokenizer_model_max_length is not None:
1547
+ new_input_embeds = [
1548
+ x[:tokenizer_model_max_length] for x in new_input_embeds
1549
+ ]
1550
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
1551
+
1552
+ # Combine them
1553
+ max_len = max(x.shape[0] for x in new_input_embeds)
1554
+ batch_size = len(new_input_embeds)
1555
+
1556
+ new_input_embeds_padded = []
1557
+ new_labels_padded = torch.full(
1558
+ (batch_size, max_len),
1559
+ IGNORE_INDEX,
1560
+ dtype=new_labels[0].dtype,
1561
+ device=new_labels[0].device,
1562
+ )
1563
+ attention_mask = torch.zeros(
1564
+ (batch_size, max_len),
1565
+ dtype=attention_mask.dtype,
1566
+ device=attention_mask.device,
1567
+ )
1568
+ position_ids = torch.zeros(
1569
+ (batch_size, max_len),
1570
+ dtype=position_ids.dtype,
1571
+ device=position_ids.device,
1572
+ )
1573
+
1574
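+ # Pad every sample to the longest sequence in the batch (on the configured padding side)
+ # and rebuild attention_mask and position_ids to match.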
+ for i, (cur_new_embed, cur_new_labels) in enumerate(
1575
+ zip(new_input_embeds, new_labels)
1576
+ ):
1577
+ cur_len = cur_new_embed.shape[0]
1578
+ if getattr(self.config, "tokenizer_padding_side", "right") == "left":
1579
+ new_input_embeds_padded.append(
1580
+ torch.cat(
1581
+ (
1582
+ torch.zeros(
1583
+ (max_len - cur_len, cur_new_embed.shape[1]),
1584
+ dtype=cur_new_embed.dtype,
1585
+ device=cur_new_embed.device,
1586
+ ),
1587
+ cur_new_embed,
1588
+ ),
1589
+ dim=0,
1590
+ )
1591
+ )
1592
+ if cur_len > 0:
1593
+ new_labels_padded[i, -cur_len:] = cur_new_labels
1594
+ attention_mask[i, -cur_len:] = True
1595
+ position_ids[i, -cur_len:] = torch.arange(
1596
+ 0,
1597
+ cur_len,
1598
+ dtype=position_ids.dtype,
1599
+ device=position_ids.device,
1600
+ )
1601
+ else:
1602
+ new_input_embeds_padded.append(
1603
+ torch.cat(
1604
+ (
1605
+ cur_new_embed,
1606
+ torch.zeros(
1607
+ (max_len - cur_len, cur_new_embed.shape[1]),
1608
+ dtype=cur_new_embed.dtype,
1609
+ device=cur_new_embed.device,
1610
+ ),
1611
+ ),
1612
+ dim=0,
1613
+ )
1614
+ )
1615
+ if cur_len > 0:
1616
+ new_labels_padded[i, :cur_len] = cur_new_labels
1617
+ attention_mask[i, :cur_len] = True
1618
+ position_ids[i, :cur_len] = torch.arange(
1619
+ 0,
1620
+ cur_len,
1621
+ dtype=position_ids.dtype,
1622
+ device=position_ids.device,
1623
+ )
1624
+
1625
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
1626
+
1627
+ if _labels is None:
1628
+ new_labels = None
1629
+ else:
1630
+ new_labels = new_labels_padded
1631
+
1632
+ if _attention_mask is None:
1633
+ attention_mask = None
1634
+ else:
1635
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
1636
+
1637
+ if _position_ids is None:
1638
+ position_ids = None
1639
+
1640
+ return (
1641
+ None,
1642
+ position_ids,
1643
+ attention_mask,
1644
+ past_key_values,
1645
+ new_input_embeds,
1646
+ new_labels,
1647
+ vision_tower_aux_feature_list_final,
1648
+ vision_tower_aux_attention_masks_list_final,
1649
+ final_size,
1650
+ global_context_feature_final,
1651
+ )
1652
+
1653
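+ # Add the image special tokens to the tokenizer, resize the embedding matrices, and
+ # initialize the new rows to the mean of the existing embeddings (optionally loading them
+ # from a pretrained mm projector checkpoint).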
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
1654
+ if model_args.mm_use_im_patch_token:
1655
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
1656
+ self.resize_token_embeddings(len(tokenizer))
1657
+
1658
+ if model_args.mm_use_im_start_end:
1659
+ num_new_tokens = tokenizer.add_tokens(
1660
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
1661
+ )
1662
+ self.resize_token_embeddings(len(tokenizer))
1663
+
1664
+ if num_new_tokens > 0:
1665
+ input_embeddings = self.get_input_embeddings().weight.data
1666
+ output_embeddings = self.get_output_embeddings().weight.data
1667
+
1668
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
1669
+ dim=0, keepdim=True
1670
+ )
1671
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
1672
+ dim=0, keepdim=True
1673
+ )
1674
+
1675
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
1676
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
1677
+
1678
+ if model_args.tune_mm_mlp_adapter:
1679
+ for p in self.get_input_embeddings().parameters():
1680
+ p.requires_grad = True
1681
+ for p in self.get_output_embeddings().parameters():
1682
+ p.requires_grad = False
1683
+
1684
+ if model_args.pretrain_mm_mlp_adapter:
1685
+ mm_projector_weights = torch.load(
1686
+ model_args.pretrain_mm_mlp_adapter, map_location="cpu"
1687
+ )
1688
+ embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
1689
+ assert num_new_tokens == 2
1690
+ if input_embeddings.shape == embed_tokens_weight.shape:
1691
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[
1692
+ -num_new_tokens:
1693
+ ]
1694
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
1695
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
1696
+ else:
1697
+ raise ValueError(
1698
+ f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}."
1699
+ )
1700
+ elif model_args.mm_use_im_patch_token:
1701
+ if model_args.tune_mm_mlp_adapter:
1702
+ for p in self.get_input_embeddings().parameters():
1703
+ p.requires_grad = False
1704
+ for p in self.get_output_embeddings().parameters():
1705
+ p.requires_grad = False
longvu/consolidate.py ADDED
@@ -0,0 +1,33 @@
1
+ # pyre-unsafe
2
+ """
3
+ Usage:
4
+ python3 -m longvu.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
5
+ """
6
+
7
+ import argparse
8
+
9
+ import torch
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+ from longvu import * # noqa
12
+ from .utils import auto_upgrade
13
+
14
+
15
+ def consolidate_ckpt(src_path, dst_path):
16
+ print("Loading model")
17
+ auto_upgrade(src_path)
18
+ src_model = AutoModelForCausalLM.from_pretrained(
19
+ src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
20
+ )
21
+ src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
22
+ src_model.save_pretrained(dst_path)
23
+ src_tokenizer.save_pretrained(dst_path)
24
+
25
+
26
+ if __name__ == "__main__":
27
+ parser = argparse.ArgumentParser()
28
+ parser.add_argument("--src", type=str, required=True)
29
+ parser.add_argument("--dst", type=str, required=True)
30
+
31
+ args = parser.parse_args()
32
+
33
+ consolidate_ckpt(args.src, args.dst)
longvu/constants.py ADDED
@@ -0,0 +1,13 @@
1
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
2
+ WORKER_HEART_BEAT_INTERVAL = 15
3
+
4
+ LOGDIR = "."
5
+
6
+ # Model Constants
7
+ IGNORE_INDEX = -100
8
+ IMAGE_TOKEN_INDEX = -200
9
+ DEFAULT_IMAGE_TOKEN = "<image>"
10
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11
+ DEFAULT_IM_START_TOKEN = "<im_start>"
12
+ DEFAULT_IM_END_TOKEN = "<im_end>"
13
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
longvu/conversation.py ADDED
@@ -0,0 +1,606 @@
1
+ import base64
2
+ import dataclasses
3
+ from enum import auto, Enum
4
+ from io import BytesIO
5
+ from typing import Any, Dict, List, Tuple, Union
6
+
7
+ from longvu.file_io import PathManager
8
+
9
+ from PIL import Image
10
+ from transformers import AutoTokenizer
11
+
12
+
13
+ class SeparatorStyle(Enum):
14
+ """Different separator style."""
15
+
16
+ SINGLE = auto()
17
+ TWO = auto()
18
+ MPT = auto()
19
+ PLAIN = auto()
20
+ LLAMA_2 = auto()
21
+ LLAMA_3 = auto()
22
+ LLAMA_3_1 = auto()
23
+ LLAMA_3_2 = auto()
24
+ QWEN = auto()
25
+ CHATML = auto()
26
+
27
+
28
+ @dataclasses.dataclass
29
+ class Conversation:
30
+ """A class that keeps all conversation history."""
31
+
32
+ system: str
33
+ roles: List[str]
34
+ messages: List[List[str]]
35
+ offset: int
36
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
37
+ sep: str = "###"
38
+ # pyre-fixme[8]: Attribute has type `str`; used as `None`.
39
+ sep2: str = None
40
+ version: str = "Unknown"
41
+
42
+ tokenizer: Any = None
43
+ # Stop criteria (the default one is EOS token)
44
+ # pyre-fixme[8]: Attribute has type `Union[List[str], str]`; used as `None`.
45
+ stop_str: Union[str, List[str]] = None
46
+ # Stops generation if meeting any token in this list
47
+ # pyre-fixme[8]: Attribute has type `List[int]`; used as `None`.
48
+ stop_token_ids: List[int] = None
49
+
50
+ skip_next: bool = False
51
+
52
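+ # Render the conversation into a single prompt string according to sep_style (or via the
+ # tokenizer's chat template for the LLaMA-3 styles).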
+ def get_prompt(self):
53
+ messages = self.messages
54
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
55
+ messages = self.messages.copy()
56
+ init_role, init_msg = messages[0].copy()
57
+ init_msg = init_msg[0].replace("<image>", "").strip()
58
+ if "mmtag" in self.version:
59
+ messages[0] = (init_role, init_msg)
60
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
61
+ messages.insert(1, (self.roles[1], "Received."))
62
+ else:
63
+ messages[0] = (init_role, "<image>\n" + init_msg)
64
+
65
+ if self.sep_style == SeparatorStyle.SINGLE:
66
+ ret = self.system + self.sep
67
+ for role, message in messages:
68
+ if message:
69
+ if type(message) is tuple:
70
+ message, _, _ = message
71
+ ret += role + ": " + message + self.sep
72
+ else:
73
+ ret += role + ":"
74
+ elif self.sep_style == SeparatorStyle.TWO:
75
+ seps = [self.sep, self.sep2]
76
+ ret = self.system + seps[0]
77
+ for i, (role, message) in enumerate(messages):
78
+ if message:
79
+ if type(message) is tuple:
80
+ message, _, _ = message
81
+ ret += role + ": " + message + seps[i % 2]
82
+ else:
83
+ ret += role + ":"
84
+
85
+ elif self.sep_style == SeparatorStyle.CHATML:
86
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
87
+ for role, message in messages:
88
+ if message:
89
+ if type(message) is tuple:
90
+ message, images, _ = message
91
+ message = "<image>" * len(images) + message
92
+ ret += role + "\n" + message + self.sep + "\n"
93
+ else:
94
+ ret += role + "\n"
95
+ return ret
96
+
97
+ elif self.sep_style == SeparatorStyle.MPT:
98
+ ret = self.system + self.sep
99
+ for role, message in messages:
100
+ if message:
101
+ if type(message) is tuple:
102
+ message, _, _ = message
103
+ ret += role + message + self.sep
104
+ else:
105
+ ret += role
106
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
107
+ wrap_sys = lambda msg: (
108
+ f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
109
+ )
110
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
111
+ ret = ""
112
+
113
+ for i, (role, message) in enumerate(messages):
114
+ if i == 0:
115
+ assert message, "first message should not be none"
116
+ assert role == self.roles[0], "first message should come from user"
117
+ if message:
118
+ if type(message) is tuple:
119
+ message, _, _ = message
120
+ if i == 0:
121
+ message = wrap_sys(self.system) + message
122
+ if i % 2 == 0:
123
+ message = wrap_inst(message)
124
+ ret += self.sep + message
125
+ else:
126
+ ret += " " + message + " " + self.sep2
127
+ else:
128
+ ret += ""
129
+ ret = ret.lstrip(self.sep)
130
+ elif self.sep_style == SeparatorStyle.LLAMA_3:
131
+ if self.tokenizer is None:
132
+ self.tokenizer = AutoTokenizer.from_pretrained(
133
+ PathManager.get_local_path(
134
+ "manifold://xr_core_ai_asl_llm/tree/users/shenx/models/Cambrian-Llama3_1-8b-t576/"
135
+ )
136
+ )
137
+ chat_template_messages = [{"role": "system", "content": self.system}]
138
+ for role, message in messages:
139
+ if message:
140
+ if type(message) is tuple:
141
+ message, images = message
142
+ message = "<image>" * len(images) + message
143
+ chat_template_messages.append({"role": role, "content": message})
144
+
145
+ # print("chat", chat_template_messages, flush=True)
146
+ return self.tokenizer.apply_chat_template(
147
+ chat_template_messages, tokenize=False, add_generation_prompt=True
148
+ )
149
+ elif self.sep_style == SeparatorStyle.LLAMA_3_1:
150
+ if self.tokenizer is None:
151
+ self.tokenizer = AutoTokenizer.from_pretrained(
152
+ PathManager.get_local_path(
153
+ "manifold://xr_core_ai_asl_llm/tree/users/shenx/models/Cambrian-Llama3_1-8b-t576/"
154
+ )
155
+ )
156
+ chat_template_messages = [{"role": "system", "content": self.system}]
157
+ for role, message in messages:
158
+ if message:
159
+ if type(message) is tuple:
160
+ message, images = message
161
+ message = "<image>" * len(images) + message
162
+ chat_template_messages.append({"role": role, "content": message})
163
+
164
+ return self.tokenizer.apply_chat_template(
165
+ chat_template_messages, tokenize=False, add_generation_prompt=False
166
+ )
167
+ elif (
168
+ # self.sep_style == SeparatorStyle.LLAMA_3 or
169
+ self.sep_style
170
+ == SeparatorStyle.LLAMA_3_2
171
+ ):
172
+ wrap_sys = lambda msg: (
173
+ f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{msg}<|eot_id|>"
174
+ if len(msg) > 0
175
+ else msg
176
+ )
177
+ wrap_inst_user = (
178
+ lambda msg: f"<|start_header_id|>user<|end_header_id|>{msg}<|eot_id|>"
179
+ )
180
+ wrap_inst_assistant = (
181
+ lambda msg: f"<|start_header_id|>assistant<|end_header_id|>{msg}<|eot_id|>"
182
+ )
183
+ ret = ""
184
+
185
+ for i, (role, message) in enumerate(messages):
186
+ if i == 0:
187
+ assert message, "first message should not be none"
188
+ assert role == self.roles[0], "first message should come from user"
189
+ if message:
190
+ if type(message) is tuple:
191
+ message, _, _ = message
192
+ if i == 0:
193
+ ret += wrap_sys(self.system)
194
+
195
+ if i % 2 == 0:
196
+ message = wrap_inst_user(message)
197
+ ret += message
198
+ else:
199
+ message = wrap_inst_assistant(message)
200
+ ret += message
201
+ else:
202
+ ret += ""
203
+ ret += "<|start_header_id|>assistant<|end_header_id|>"
204
+ elif self.sep_style == SeparatorStyle.PLAIN:
205
+ seps = [self.sep, self.sep2]
206
+ ret = self.system
207
+ for i, (role, message) in enumerate(messages):
208
+ if message:
209
+ if type(message) is tuple:
210
+ message, _, _ = message
211
+ ret += message + seps[i % 2]
212
+ else:
213
+ ret += ""
214
+ else:
215
+ raise ValueError(f"Invalid style: {self.sep_style}")
216
+
217
+ return ret
218
+
219
+ def append_message(self, role, message):
220
+ self.messages.append([role, message])
221
+
222
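+ # Pad or resize the image according to image_process_mode, clamp its longer side to
+ # max_len, and return either a PIL image or a base64-encoded string.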
+ def process_image(
223
+ self,
224
+ image,
225
+ image_process_mode,
226
+ return_pil=False,
227
+ image_format="PNG",
228
+ max_len=1344,
229
+ min_len=672,
230
+ ):
231
+ if image_process_mode == "Pad":
232
+
233
+ def expand2square(pil_img, background_color=(122, 116, 104)):
234
+ width, height = pil_img.size
235
+ if width == height:
236
+ return pil_img
237
+ elif width > height:
238
+ result = Image.new(pil_img.mode, (width, width), background_color)
239
+ result.paste(pil_img, (0, (width - height) // 2))
240
+ return result
241
+ else:
242
+ result = Image.new(pil_img.mode, (height, height), background_color)
243
+ result.paste(pil_img, ((height - width) // 2, 0))
244
+ return result
245
+
246
+ image = expand2square(image)
247
+ elif image_process_mode in ["Default", "Crop"]:
248
+ pass
249
+ elif image_process_mode == "Resize":
250
+ image = image.resize((336, 336))
251
+ else:
252
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
253
+ if max(image.size) > max_len:
254
+ max_hw, min_hw = max(image.size), min(image.size)
255
+ aspect_ratio = max_hw / min_hw
256
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
257
+ longest_edge = int(shortest_edge * aspect_ratio)
258
+ W, H = image.size
259
+ if H > W:
260
+ H, W = longest_edge, shortest_edge
261
+ else:
262
+ H, W = shortest_edge, longest_edge
263
+ image = image.resize((W, H))
264
+ if return_pil:
265
+ return image
266
+ else:
267
+ buffered = BytesIO()
268
+ image.save(buffered, format=image_format)
269
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
270
+ return img_b64_str
271
+
272
+ def get_images(self, return_pil=False):
273
+ images = []
274
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
275
+ if i % 2 == 0:
276
+ if type(msg) is tuple:
277
+ msg, image, image_process_mode = msg
278
+ image = self.process_image(
279
+ image, image_process_mode, return_pil=return_pil
280
+ )
281
+ images.append(image)
282
+ return images
283
+
284
+ def to_gradio_chatbot(self):
285
+ ret = []
286
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
287
+ if i % 2 == 0:
288
+ if type(msg) is tuple:
289
+ msg, image, image_process_mode = msg
290
+ img_b64_str = self.process_image(
291
+ image, "Default", return_pil=False, image_format="JPEG"
292
+ )
293
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
294
+ msg = img_str + msg.replace("<image>", "").strip()
295
+ ret.append([msg, None])
296
+ else:
297
+ ret.append([msg, None])
298
+ else:
299
+ ret[-1][-1] = msg
300
+ return ret
301
+
302
+ def copy(self):
303
+ return Conversation(
304
+ system=self.system,
305
+ roles=self.roles,
306
+ messages=[[x, y] for x, y in self.messages],
307
+ offset=self.offset,
308
+ sep_style=self.sep_style,
309
+ sep=self.sep,
310
+ sep2=self.sep2,
311
+ version=self.version,
312
+ )
313
+
314
+ def dict(self):
315
+ if len(self.get_images()) > 0:
316
+ return {
317
+ "system": self.system,
318
+ "roles": self.roles,
319
+ "messages": [
320
+ [x, y[0] if type(y) is tuple else y] for x, y in self.messages
321
+ ],
322
+ "offset": self.offset,
323
+ "sep": self.sep,
324
+ "sep2": self.sep2,
325
+ }
326
+ return {
327
+ "system": self.system,
328
+ "roles": self.roles,
329
+ "messages": self.messages,
330
+ "offset": self.offset,
331
+ "sep": self.sep,
332
+ "sep2": self.sep2,
333
+ }
334
+
335
+
336
+ conv_vicuna_v0 = Conversation(
337
+ system="A chat between a curious human and an artificial intelligence assistant. "
338
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
339
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
340
+ roles=("Human", "Assistant"),
341
+ # pyre-fixme[6]: For 3rd argument expected `List[List[str]]` but got
342
+ # `Tuple[Tuple[str, str], Tuple[str, str]]`.
343
+ messages=(
344
+ (
345
+ "Human",
346
+ "What are the key differences between renewable and non-renewable energy sources?",
347
+ ),
348
+ (
349
+ "Assistant",
350
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
351
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
352
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
353
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
354
+ "renewable and non-renewable energy sources:\n"
355
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
356
+ "energy sources are finite and will eventually run out.\n"
357
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
358
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
359
+ "and other negative effects.\n"
360
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
361
+ "have lower operational costs than non-renewable sources.\n"
362
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
363
+ "locations than non-renewable sources.\n"
364
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
365
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
366
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
367
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
368
+ ),
369
+ ),
370
+ offset=2,
371
+ sep_style=SeparatorStyle.SINGLE,
372
+ sep="###",
373
+ )
374
+
375
+ conv_vicuna_v1 = Conversation(
376
+ system="A chat between a curious user and an artificial intelligence assistant. "
377
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
378
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
379
+ roles=("USER", "ASSISTANT"),
380
+ version="v1",
381
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
382
+ messages=(),
383
+ offset=0,
384
+ sep_style=SeparatorStyle.TWO,
385
+ sep=" ",
386
+ sep2="</s>",
387
+ )
388
+
389
+ conv_llama_2 = Conversation(
390
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
391
+
392
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
393
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
394
+ roles=("USER", "ASSISTANT"),
395
+ version="llama_v2",
396
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
397
+ messages=(),
398
+ offset=0,
399
+ sep_style=SeparatorStyle.LLAMA_2,
400
+ sep="<s>",
401
+ sep2="</s>",
402
+ )
403
+
404
+ conv_llava_llama_2 = Conversation(
405
+ system="You are a helpful language and vision assistant. "
406
+ "You are able to understand the visual content that the user provides, "
407
+ "and assist the user with a variety of tasks using natural language.",
408
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
409
+ roles=("USER", "ASSISTANT"),
410
+ version="llama_v2",
411
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
412
+ messages=(),
413
+ offset=0,
414
+ sep_style=SeparatorStyle.LLAMA_2,
415
+ sep="<s>",
416
+ sep2="</s>",
417
+ )
418
+
419
+ conv_mpt = Conversation(
420
+ system="""<|im_start|>system
421
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
422
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
423
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
424
+ version="mpt",
425
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
426
+ messages=(),
427
+ offset=0,
428
+ sep_style=SeparatorStyle.MPT,
429
+ sep="<|im_end|>",
430
+ )
431
+
432
+ conv_llava_plain = Conversation(
433
+ system="",
434
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
435
+ roles=("", ""),
436
+ # pyre-fixme[6]: For 3rd argument expected `List[List[str]]` but got `Tuple[]`.
437
+ messages=(),
438
+ offset=0,
439
+ sep_style=SeparatorStyle.PLAIN,
440
+ sep="\n",
441
+ version="plain",
442
+ )
443
+
444
+ conv_llava_v0 = Conversation(
445
+ system="A chat between a curious human and an artificial intelligence assistant. "
446
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
447
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
448
+ roles=("Human", "Assistant"),
449
+ # pyre-fixme[6]: For 3rd argument expected `List[List[str]]` but got `Tuple[]`.
450
+ messages=(),
451
+ offset=0,
452
+ sep_style=SeparatorStyle.SINGLE,
453
+ sep="###",
454
+ )
455
+
456
+ conv_llava_v0_mmtag = Conversation(
457
+ system="A chat between a curious user and an artificial intelligence assistant. "
458
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
459
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
460
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
461
+ roles=("Human", "Assistant"),
462
+ # pyre-fixme[6]: For 3rd argument expected `List[List[str]]` but got `Tuple[]`.
463
+ messages=(),
464
+ offset=0,
465
+ sep_style=SeparatorStyle.SINGLE,
466
+ sep="###",
467
+ version="v0_mmtag",
468
+ )
469
+
470
+ conv_llava_v1 = Conversation(
471
+ system="A chat between a curious human and an artificial intelligence assistant. "
472
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
473
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
474
+ roles=("USER", "ASSISTANT"),
475
+ version="v1",
476
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
477
+ messages=(),
478
+ offset=0,
479
+ sep_style=SeparatorStyle.TWO,
480
+ sep=" ",
481
+ sep2="</s>",
482
+ )
483
+
484
+ conv_llava_v1_mmtag = Conversation(
485
+ system="A chat between a curious user and an artificial intelligence assistant. "
486
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
487
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
488
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
489
+ roles=("USER", "ASSISTANT"),
490
+ # pyre-fixme[6]: For 3rd argument expected `List[List[str]]` but got `Tuple[]`.
491
+ messages=(),
492
+ offset=0,
493
+ sep_style=SeparatorStyle.TWO,
494
+ sep=" ",
495
+ sep2="</s>",
496
+ version="v1_mmtag",
497
+ )
498
+
499
+ conv_mistral_instruct = Conversation(
500
+ system="",
501
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
502
+ roles=("USER", "ASSISTANT"),
503
+ version="llama_v2",
504
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
505
+ messages=(),
506
+ offset=0,
507
+ sep_style=SeparatorStyle.LLAMA_2,
508
+ sep="",
509
+ sep2="</s>",
510
+ )
511
+
512
+ conv_chatml_direct = Conversation(
513
+ system="""<|im_start|>system
514
+ Answer the questions.""",
515
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
516
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
517
+ version="mpt",
518
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
519
+ messages=(),
520
+ offset=0,
521
+ sep_style=SeparatorStyle.MPT,
522
+ sep="<|im_end|>",
523
+ )
524
+
525
+ # llama3_tokenizer = AutoTokenizer.from_pretrained(
526
+ # PathManager.get_local_path(
527
+ # "./checkpoint/"
528
+ # )
529
+ # )
530
+
531
+ conv_llama3 = Conversation(
532
+ system="""As a multimodal AI, you have the ability to process and analyze images. Whenever an image is present in the conversation, very carefully examine it and consider its content when formulating your response. You should give concise responses to very simple questions, but provide thorough responses to more complex and open-ended questions.""",
533
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
534
+ roles=("user", "assistant"),
535
+ version="llama3",
536
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
537
+ messages=(),
538
+ offset=0,
539
+ sep_style=SeparatorStyle.LLAMA_3,
540
+ # tokenizer=llama3_tokenizer,
541
+ sep="<|eot_id|>",
542
+ )
543
+
544
+ conv_llama3_2 = Conversation(
545
+ system="""You are a helpful assistant.""",
546
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
547
+ roles=("user", "assistant"),
548
+ version="llama3_2",
549
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
550
+ messages=(),
551
+ offset=0,
552
+ sep_style=SeparatorStyle.LLAMA_3_2,
553
+ sep="<|eot_id|>",
554
+ )
555
+
556
+ conv_phi3_instruct = Conversation(
557
+ system="""<|system|>\nYou are a helpful AI assistant.""",
558
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
559
+ roles=("\n<|user|>\n", "\n<|assistant|>\n"),
560
+ version="phi3",
561
+ # pyre-fixme[6]: For 4th argument expected `List[List[str]]` but got `Tuple[]`.
562
+ messages=(),
563
+ offset=0,
564
+ sep_style=SeparatorStyle.MPT,
565
+ sep="<|end|>",
566
+ )
567
+
568
+ conv_qwen = Conversation(
569
+ system="""<|im_start|>system
570
+ You are a helpful assistant.""",
571
+ # pyre-fixme[6]: For 2nd argument expected `List[str]` but got `Tuple[str, str]`.
572
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
573
+ version="qwen",
574
+ messages=[],
575
+ offset=0,
576
+ sep_style=SeparatorStyle.CHATML,
577
+ sep="<|im_end|>",
578
+ )
579
+
580
+ default_conversation = conv_vicuna_v1
581
+ conv_templates = {
582
+ "default": conv_vicuna_v0,
583
+ "v0": conv_vicuna_v0,
584
+ "v1": conv_vicuna_v1,
585
+ "vicuna_v1": conv_vicuna_v1,
586
+ "llama_2": conv_llama_2,
587
+ "mistral_instruct": conv_mistral_instruct,
588
+ "chatml_direct": conv_chatml_direct,
589
+ "mistral_direct": conv_chatml_direct,
590
+ "plain": conv_llava_plain,
591
+ "v0_plain": conv_llava_plain,
592
+ "llava_v0": conv_llava_v0,
593
+ "v0_mmtag": conv_llava_v0_mmtag,
594
+ "llava_v1": conv_llava_v1,
595
+ "v1_mmtag": conv_llava_v1_mmtag,
596
+ "llava_llama_2": conv_llava_llama_2,
597
+ "mpt": conv_mpt,
598
+ "llama3": conv_llama3,
599
+ "llama3_2": conv_llama3_2,
600
+ "phi3": conv_phi3_instruct,
601
+ "qwen": conv_qwen,
602
+ }
603
+
604
+
605
+ if __name__ == "__main__":
606
+ print(default_conversation.get_prompt())
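A minimal usage sketch for the template registry defined above. It assumes the Conversation class exposes the LLaVA-style append_message helper (only roles and get_prompt() appear in this excerpt); the template key and question text are placeholders.

import copy

from longvu.conversation import conv_templates

# Copy a registered template so the module-level object stays pristine.
conv = copy.deepcopy(conv_templates["qwen"])
# Assumed LLaVA-style API (not shown in this diff): queue a user turn and an empty assistant slot.
conv.append_message(conv.roles[0], "<image>\nDescribe the video.")
conv.append_message(conv.roles[1], None)
# get_prompt() is the method exercised at the bottom of this file.
prompt = conv.get_prompt()
print(prompt)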
longvu/file_io.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and its affiliates.
2
+
3
+ # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
4
+
5
+ from iopath.common.file_io import HTTPURLHandler, PathManager as PathManagerBase
6
+
7
+ __all__ = ["PathManager"]
8
+
9
+
10
+ PathManager = PathManagerBase()
11
+ PathManager.register_handler(HTTPURLHandler())
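A short sketch of how this PathManager instance can be used; the URL below is a placeholder.

from longvu.file_io import PathManager

# The registered HTTPURLHandler resolves http(s) URLs to a locally cached copy.
local_path = PathManager.get_local_path("https://example.com/checkpoint.bin")  # placeholder URL
with PathManager.open(local_path, "rb") as f:
    blob = f.read()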
longvu/language_model/__pycache__/cambrian_llama.cpython-310.pyc ADDED
Binary file (8.51 kB).
 
longvu/language_model/__pycache__/cambrian_qwen.cpython-310.pyc ADDED
Binary file (7.98 kB).
 
longvu/language_model/cambrian_llama.py ADDED
@@ -0,0 +1,546 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import (
24
+ AutoConfig,
25
+ AutoModelForCausalLM,
26
+ LlamaConfig,
27
+ LlamaForCausalLM,
28
+ LlamaModel,
29
+ )
30
+ from transformers.cache_utils import Cache, DynamicCache
31
+ from transformers.generation.utils import GenerateOutput
32
+
33
+ from transformers.modeling_attn_mask_utils import (
34
+ _prepare_4d_causal_attention_mask,
35
+ _prepare_4d_causal_attention_mask_for_sdpa,
36
+ )
37
+
38
+ from transformers.modeling_outputs import (
39
+ BaseModelOutputWithPast,
40
+ CausalLMOutputWithPast,
41
+ )
42
+ from transformers.utils import logging
43
+
44
+ from ..cambrian_arch import CambrianMetaForCausalLM, CambrianMetaModel
45
+
46
+ IS_XLA_AVAILABLE = False
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+
51
+ class CambrianConfig(LlamaConfig):
52
+ model_type = "cambrian_llama"
53
+
54
+ debug = "debug"
55
+
56
+
57
+ class CambrianLlamaModel(CambrianMetaModel, LlamaModel):
58
+ config_class = CambrianConfig
59
+
60
+ def __init__(self, config: LlamaConfig):
61
+ super(CambrianLlamaModel, self).__init__(config)
62
+
63
+ def forward(
64
+ self,
65
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
66
+ input_ids: torch.LongTensor = None,
67
+ attention_mask: Optional[torch.Tensor] = None,
68
+ position_ids: Optional[torch.LongTensor] = None,
69
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
70
+ inputs_embeds: Optional[torch.FloatTensor] = None,
71
+ use_cache: Optional[bool] = None,
72
+ output_attentions: Optional[bool] = None,
73
+ output_hidden_states: Optional[bool] = None,
74
+ return_dict: Optional[bool] = None,
75
+ vision_tower_aux_feature_list: Optional[List[torch.FloatTensor]] = None,
76
+ vision_tower_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
77
+ final_vision_feature_size: Optional[List[tuple]] = None,
78
+ global_context_feature: Optional[torch.Tensor] = None,
79
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
80
+
81
+ output_attentions = (
82
+ output_attentions
83
+ if output_attentions is not None
84
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `config`.
85
+ else self.config.output_attentions
86
+ )
87
+
88
+ output_hidden_states = (
89
+ output_hidden_states
90
+ if output_hidden_states is not None
91
+ else self.config.output_hidden_states
92
+ )
93
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
94
+
95
+ return_dict = (
96
+ return_dict if return_dict is not None else self.config.use_return_dict
97
+ )
98
+
99
+ # retrieve input_ids and inputs_embeds
100
+ if input_ids is not None and inputs_embeds is not None:
101
+ raise ValueError(
102
+ "You cannot specify both input_ids and inputs_embeds at the same time"
103
+ )
104
+ elif input_ids is not None:
105
+ batch_size, seq_length = input_ids.shape[:2]
106
+ elif inputs_embeds is not None:
107
+ batch_size, seq_length = inputs_embeds.shape[:2]
108
+ else:
109
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
110
+
111
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
112
+ # `gradient_checkpointing`.
113
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `training`.
114
+ if self.gradient_checkpointing and self.training:
115
+ if use_cache:
116
+ logger.warning_once(
117
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
118
+ )
119
+ use_cache = False
120
+
121
+ past_key_values_length = 0
122
+ if use_cache:
123
+ use_legacy_cache = not isinstance(past_key_values, Cache)
124
+ if use_legacy_cache:
125
+ # pyre-fixme[9]: past_key_values has type
126
+ # `Optional[List[FloatTensor]]`; used as `DynamicCache`.
127
+ # pyre-fixme[6]: For 1st argument expected
128
+ # `Optional[Tuple[Tuple[FloatTensor]]]` but got
129
+ # `Optional[List[FloatTensor]]`.
130
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
131
+ # pyre-fixme[16]: `Optional` has no attribute `get_usable_length`.
132
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
133
+
134
+ if position_ids is None:
135
+ # pyre-fixme[16]: `Optional` has no attribute `device`.
136
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
137
+ position_ids = torch.arange(
138
+ past_key_values_length,
139
+ seq_length + past_key_values_length,
140
+ dtype=torch.long,
141
+ device=device,
142
+ )
143
+ position_ids = position_ids.unsqueeze(0)
144
+
145
+ if inputs_embeds is None:
146
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `embed_tokens`.
147
+ inputs_embeds = self.embed_tokens(input_ids)
148
+
149
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
150
+ # `_use_flash_attention_2`.
151
+ self._use_flash_attention_2 = getattr(self, "_use_flash_attention_2", False)
152
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `_use_sdpa`.
153
+ self._use_sdpa = getattr(self, "_use_sdpa", True)
154
+ if self._use_flash_attention_2:
155
+ # 2d mask is passed through the layers
156
+ attention_mask = (
157
+ attention_mask
158
+ if (attention_mask is not None and 0 in attention_mask)
159
+ else None
160
+ )
161
+ elif self._use_sdpa and not output_attentions:
162
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
163
+ # the manual implementation that requires a 4D causal mask in all cases.
164
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
165
+ attention_mask,
166
+ (batch_size, seq_length),
167
+ inputs_embeds,
168
+ past_key_values_length,
169
+ )
170
+ else:
171
+ # 4d mask is passed through the layers
172
+ attention_mask = _prepare_4d_causal_attention_mask(
173
+ attention_mask,
174
+ (batch_size, seq_length),
175
+ inputs_embeds,
176
+ past_key_values_length,
177
+ )
178
+
179
+ # embed positions
180
+ hidden_states = inputs_embeds
181
+ # decoder layers
182
+ all_hidden_states = () if output_hidden_states else None
183
+ all_self_attns = () if output_attentions else None
184
+ next_decoder_cache = None
185
+
186
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `layers`.
187
+ for i, decoder_layer in enumerate(self.layers):
188
+ if output_hidden_states:
189
+ all_hidden_states += (hidden_states,)
190
+
191
+ if self.gradient_checkpointing and self.training:
192
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
193
+ # `_gradient_checkpointing_func`.
194
+ layer_outputs = self._gradient_checkpointing_func(
195
+ decoder_layer.__call__,
196
+ hidden_states,
197
+ attention_mask,
198
+ position_ids,
199
+ past_key_values,
200
+ output_attentions,
201
+ use_cache,
202
+ )
203
+ else:
204
+ layer_outputs = decoder_layer(
205
+ hidden_states,
206
+ attention_mask=attention_mask,
207
+ position_ids=position_ids,
208
+ past_key_value=past_key_values,
209
+ output_attentions=output_attentions,
210
+ use_cache=use_cache,
211
+ )
212
+
213
+ hidden_states = layer_outputs[0]
214
+
215
+ if use_cache:
216
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
217
+
218
+ if output_attentions:
219
+ all_self_attns += (layer_outputs[1],)
220
+
221
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute `norm`.
222
+ hidden_states = self.norm(hidden_states)
223
+
224
+ # add hidden states from the last decoder layer
225
+ if output_hidden_states:
226
+ all_hidden_states += (hidden_states,)
227
+
228
+ next_cache = None
229
+ if use_cache:
230
+ next_cache = (
231
+ next_decoder_cache.to_legacy_cache()
232
+ # pyre-fixme[61]: `use_legacy_cache` is undefined, or not always
233
+ # defined.
234
+ if use_legacy_cache
235
+ else next_decoder_cache
236
+ )
237
+ if not return_dict:
238
+ return tuple(
239
+ v
240
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
241
+ if v is not None
242
+ )
243
+ return BaseModelOutputWithPast(
244
+ last_hidden_state=hidden_states,
245
+ past_key_values=next_cache,
246
+ hidden_states=all_hidden_states,
247
+ attentions=all_self_attns,
248
+ )
249
+
250
+
251
+ class CambrianLlamaForCausalLM(LlamaForCausalLM, CambrianMetaForCausalLM):
252
+ config_class = CambrianConfig
253
+
254
+ def __init__(self, config):
255
+ super(LlamaForCausalLM, self).__init__(config)
256
+
257
+ self.model = CambrianLlamaModel(config)
258
+ self.pretraining_tp = config.pretraining_tp
259
+ self.vocab_size = config.vocab_size
260
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
261
+
262
+ # Initialize weights and apply final processing
263
+ self.post_init()
264
+
265
+ def get_model(self):
266
+ return self.model
267
+
268
+ def forward(
269
+ self,
270
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
271
+ input_ids: torch.LongTensor = None,
272
+ attention_mask: Optional[torch.Tensor] = None,
273
+ position_ids: Optional[torch.LongTensor] = None,
274
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
275
+ inputs_embeds: Optional[torch.FloatTensor] = None,
276
+ labels: Optional[torch.LongTensor] = None,
277
+ use_cache: Optional[bool] = None,
278
+ output_attentions: Optional[bool] = None,
279
+ output_hidden_states: Optional[bool] = None,
280
+ images: Optional[torch.FloatTensor] = None,
281
+ image_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
282
+ image_sizes: Optional[List[List[int]]] = None,
283
+ return_dict: Optional[bool] = None,
284
+ cache_position=None,
285
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
286
+
287
+ final_vision_feature_size = None
288
+
289
+ if inputs_embeds is None:
290
+ (
291
+ input_ids,
292
+ position_ids,
293
+ attention_mask,
294
+ past_key_values,
295
+ inputs_embeds,
296
+ labels,
297
+ vision_tower_aux_feature_list,
298
+ vision_tower_aux_attention_masks_list,
299
+ final_vision_feature_size,
300
+ global_context_feature,
301
+ ) = self.prepare_inputs_labels_for_multimodal(
302
+ input_ids,
303
+ position_ids,
304
+ attention_mask,
305
+ past_key_values,
306
+ labels,
307
+ images,
308
+ image_aux_attention_masks_list,
309
+ image_sizes,
310
+ )
311
+ if IS_XLA_AVAILABLE:
312
+ # Very Important for TorchXLA
313
+ # self.model.gradient_checkpointing = False
314
+
315
+ # pyre-fixme[21]: Could not find module `torch_xla.utils.checkpoint`.
316
+ from torch_xla.utils.checkpoint import checkpoint
317
+
318
+ # self.model.gradient_checkpointing = True
319
+ # pyre-fixme[16]: `CambrianLlamaModel` has no attribute
320
+ # `_gradient_checkpointing_func`.
321
+ self.model._gradient_checkpointing_func = checkpoint
322
+
323
+ output_attentions = (
324
+ output_attentions
325
+ if output_attentions is not None
326
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute `config`.
327
+ else self.config.output_attentions
328
+ )
329
+ output_hidden_states = (
330
+ output_hidden_states
331
+ if output_hidden_states is not None
332
+ else self.config.output_hidden_states
333
+ )
334
+ return_dict = (
335
+ return_dict if return_dict is not None else self.config.use_return_dict
336
+ )
337
+
338
+ # training
339
+ if IS_XLA_AVAILABLE:
340
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
341
+ # pyre-fixme[29]: `CambrianLlamaModel` is not a function.
342
+ outputs = self.model(
343
+ input_ids=input_ids,
344
+ attention_mask=attention_mask,
345
+ position_ids=position_ids,
346
+ past_key_values=past_key_values,
347
+ inputs_embeds=inputs_embeds,
348
+ use_cache=use_cache,
349
+ output_attentions=output_attentions,
350
+ output_hidden_states=output_hidden_states,
351
+ return_dict=return_dict,
352
+ # pyre-fixme[61]: `vision_tower_aux_feature_list` is undefined, or
353
+ # not always defined.
354
+ vision_tower_aux_feature_list=vision_tower_aux_feature_list,
355
+ # pyre-fixme[61]: `vision_tower_aux_attention_masks_list` is
356
+ # undefined, or not always defined.
357
+ vision_tower_aux_attention_masks_list=vision_tower_aux_attention_masks_list,
358
+ final_vision_feature_size=final_vision_feature_size,
359
+ # pyre-fixme[61]: `global_context_feature` is undefined, or not
360
+ # always defined.
361
+ global_context_feature=global_context_feature,
362
+ )
363
+
364
+ # inference
365
+ else:
366
+ if hasattr(self, "vision_tower_aux_feature_list"):
367
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
368
+ # pyre-fixme[29]: `CambrianLlamaModel` is not a function.
369
+ outputs = self.model(
370
+ input_ids=input_ids,
371
+ attention_mask=attention_mask,
372
+ position_ids=position_ids,
373
+ past_key_values=past_key_values,
374
+ inputs_embeds=inputs_embeds,
375
+ use_cache=use_cache,
376
+ output_attentions=output_attentions,
377
+ output_hidden_states=output_hidden_states,
378
+ return_dict=return_dict,
379
+ vision_tower_aux_feature_list=(
380
+ # pyre-fixme[61]: `vision_tower_aux_feature_list` is
381
+ # undefined, or not always defined.
382
+ vision_tower_aux_feature_list
383
+ if inputs_embeds is None
384
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
385
+ # attribute `vision_tower_aux_feature_list`.
386
+ else self.vision_tower_aux_feature_list
387
+ ),
388
+ vision_tower_aux_attention_masks_list=(
389
+ # pyre-fixme[61]: `vision_tower_aux_attention_masks_list` is
390
+ # undefined, or not always defined.
391
+ vision_tower_aux_attention_masks_list
392
+ if inputs_embeds is None
393
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
394
+ # attribute `vision_tower_aux_attention_masks_list`.
395
+ else self.vision_tower_aux_attention_masks_list
396
+ ),
397
+ final_vision_feature_size=(
398
+ final_vision_feature_size
399
+ if inputs_embeds is None
400
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
401
+ # attribute `final_vision_feature_size`.
402
+ else self.final_vision_feature_size
403
+ ),
404
+ global_context_feature=(
405
+ # pyre-fixme[61]: `global_context_feature` is undefined, or
406
+ # not always defined.
407
+ global_context_feature
408
+ if inputs_embeds is None
409
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no
410
+ # attribute `global_context_feature`.
411
+ else self.global_context_feature
412
+ ),
413
+ )
414
+ else:
415
+ # pyre-fixme[29]: `CambrianLlamaModel` is not a function.
416
+ outputs = self.model(
417
+ input_ids=input_ids,
418
+ attention_mask=attention_mask,
419
+ position_ids=position_ids,
420
+ past_key_values=past_key_values,
421
+ inputs_embeds=inputs_embeds,
422
+ use_cache=use_cache,
423
+ output_attentions=output_attentions,
424
+ output_hidden_states=output_hidden_states,
425
+ return_dict=return_dict,
426
+ # final_vision_feature_size=final_vision_feature_size,
427
+ )
428
+
429
+ hidden_states = outputs[0]
430
+ if self.config.pretraining_tp > 1:
431
+ lm_head_slices = self.lm_head.weight.split(
432
+ self.vocab_size // self.config.pretraining_tp, dim=0
433
+ )
434
+ logits = [
435
+ F.linear(hidden_states, lm_head_slices[i])
436
+ for i in range(self.config.pretraining_tp)
437
+ ]
438
+ logits = torch.cat(logits, dim=-1)
439
+ else:
440
+ logits = self.lm_head(hidden_states)
441
+ logits = logits.float()
442
+
443
+ loss = None
444
+ if labels is not None:
445
+ # Shift so that tokens < n predict n
446
+ shift_logits = logits[..., :-1, :].contiguous()
447
+ shift_labels = labels[..., 1:].contiguous()
448
+ # Flatten the tokens
449
+ loss_fct = CrossEntropyLoss()
450
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
451
+ shift_labels = shift_labels.view(-1)
452
+ # Enable model parallelism
453
+ shift_labels = shift_labels.to(shift_logits.device)
454
+ loss = loss_fct(shift_logits, shift_labels)
455
+
456
+ if not return_dict:
457
+ output = (logits,) + outputs[1:]
458
+ return (loss,) + output if loss is not None else output
459
+
460
+ return CausalLMOutputWithPast(
461
+ loss=loss,
462
+ logits=logits,
463
+ past_key_values=outputs.past_key_values,
464
+ hidden_states=outputs.hidden_states,
465
+ attentions=outputs.attentions,
466
+ )
467
+
468
+ @torch.no_grad()
469
+ def generate(
470
+ self,
471
+ inputs: Optional[torch.Tensor] = None,
472
+ images: Optional[torch.Tensor] = None,
473
+ image_sizes: Optional[torch.Tensor] = None,
474
+ **kwargs,
475
+ ) -> Union[GenerateOutput, torch.LongTensor]:
476
+ position_ids = kwargs.pop("position_ids", None)
477
+ attention_mask = kwargs.pop("attention_mask", None)
478
+ if "inputs_embeds" in kwargs:
479
+ raise NotImplementedError("`inputs_embeds` is not supported")
480
+
481
+ if images is not None:
482
+ (
483
+ inputs,
484
+ position_ids,
485
+ attention_mask,
486
+ _,
487
+ inputs_embeds,
488
+ _,
489
+ vision_tower_aux_feature_list,
490
+ vision_tower_aux_attention_masks_list,
491
+ final_vision_feature_size,
492
+ global_context_feature,
493
+ ) = self.prepare_inputs_labels_for_multimodal(
494
+ inputs,
495
+ position_ids,
496
+ attention_mask,
497
+ None,
498
+ None,
499
+ images,
500
+ image_sizes=image_sizes,
501
+ )
502
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
503
+ # `vision_tower_aux_feature_list`.
504
+ self.vision_tower_aux_feature_list = vision_tower_aux_feature_list
505
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
506
+ # `vision_tower_aux_attention_masks_list`.
507
+ self.vision_tower_aux_attention_masks_list = (
508
+ vision_tower_aux_attention_masks_list
509
+ )
510
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
511
+ # `final_vision_feature_size`.
512
+ self.final_vision_feature_size = final_vision_feature_size
513
+ # pyre-fixme[16]: `CambrianLlamaForCausalLM` has no attribute
514
+ # `global_context_feature`.
515
+ self.global_context_feature = global_context_feature
516
+ else:
517
+ inputs_embeds = self.get_model().embed_tokens(inputs)
518
+
519
+ # pyre-fixme[16]: `LlamaForCausalLM` has no attribute `generate`.
520
+ return super().generate(
521
+ position_ids=position_ids,
522
+ attention_mask=attention_mask,
523
+ inputs_embeds=inputs_embeds,
524
+ **kwargs,
525
+ )
526
+
527
+ def prepare_inputs_for_generation(
528
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
529
+ ):
530
+ images = kwargs.pop("images", None)
531
+ image_sizes = kwargs.pop("image_sizes", None)
532
+ inputs = super().prepare_inputs_for_generation(
533
+ input_ids,
534
+ past_key_values=past_key_values,
535
+ inputs_embeds=inputs_embeds,
536
+ **kwargs,
537
+ )
538
+ if images is not None:
539
+ inputs["images"] = images
540
+ if image_sizes is not None:
541
+ inputs["image_sizes"] = image_sizes
542
+ return inputs
543
+
544
+
545
+ AutoConfig.register("cambrian_llama", CambrianConfig)
546
+ AutoModelForCausalLM.register(CambrianConfig, CambrianLlamaForCausalLM)
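With the two registrations above, a checkpoint whose config.json declares "model_type": "cambrian_llama" resolves to the classes in this file once the module has been imported. A hedged loading sketch (the checkpoint path is a placeholder):

import torch
from transformers import AutoModelForCausalLM

import longvu.language_model.cambrian_llama  # noqa: F401  # side effect: runs the registrations above

model = AutoModelForCausalLM.from_pretrained(
    "./checkpoint/",         # placeholder path
    torch_dtype=torch.float16,
)
model.eval()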
longvu/language_model/cambrian_qwen.py ADDED
@@ -0,0 +1,471 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM
24
+ from transformers.cache_utils import Cache, DynamicCache
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutputWithPast,
29
+ CausalLMOutputWithPast,
30
+ )
31
+ from transformers.utils import logging
32
+
33
+ from ..cambrian_arch import CambrianMetaForCausalLM, CambrianMetaModel
34
+
35
+ IS_XLA_AVAILABLE = False
36
+
37
+ from transformers import Qwen2Config, Qwen2ForCausalLM, Qwen2Model
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ class CambrianConfig(Qwen2Config):
43
+ model_type = "cambrian_qwen"
44
+
45
+ debug = "debug"
46
+
47
+
48
+ class CambrianQwenModel(CambrianMetaModel, Qwen2Model):
49
+ config_class = CambrianConfig
50
+
51
+ def __init__(self, config: Qwen2Config):
52
+ super(CambrianQwenModel, self).__init__(config)
53
+
54
+ def forward(
55
+ self,
56
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
57
+ input_ids: torch.LongTensor = None,
58
+ attention_mask: Optional[torch.Tensor] = None,
59
+ position_ids: Optional[torch.LongTensor] = None,
60
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
61
+ inputs_embeds: Optional[torch.FloatTensor] = None,
62
+ use_cache: Optional[bool] = None,
63
+ output_attentions: Optional[bool] = None,
64
+ output_hidden_states: Optional[bool] = None,
65
+ return_dict: Optional[bool] = None,
66
+ cache_position: Optional[torch.LongTensor] = None,
67
+ vision_tower_aux_feature_list: Optional[List[torch.FloatTensor]] = None,
68
+ vision_tower_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
69
+ final_vision_feature_size: Optional[List[tuple]] = None,
70
+ global_context_feature: Optional[torch.Tensor] = None,
71
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
72
+ output_attentions = (
73
+ output_attentions
74
+ if output_attentions is not None
75
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `config`.
76
+ else self.config.output_attentions
77
+ )
78
+ output_hidden_states = (
79
+ output_hidden_states
80
+ if output_hidden_states is not None
81
+ else self.config.output_hidden_states
82
+ )
83
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
84
+
85
+ return_dict = (
86
+ return_dict if return_dict is not None else self.config.use_return_dict
87
+ )
88
+
89
+ if (input_ids is None) ^ (inputs_embeds is not None):
90
+ raise ValueError(
91
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
92
+ )
93
+
94
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `gradient_checkpointing`.
95
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `training`.
96
+ if self.gradient_checkpointing and self.training:
97
+ if use_cache:
98
+ logger.warning_once(
99
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
100
+ )
101
+ use_cache = False
102
+
103
+ use_legacy_cache = False
104
+ if use_cache and not isinstance(past_key_values, Cache):
105
+ use_legacy_cache = True
106
+ # pyre-fixme[6]: For 1st argument expected
107
+ # `Optional[Tuple[Tuple[FloatTensor]]]` but got
108
+ # `Optional[List[FloatTensor]]`.
109
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
110
+ logger.warning_once(
111
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
112
+ "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
113
+ )
114
+
115
+ if inputs_embeds is None:
116
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `embed_tokens`.
117
+ inputs_embeds = self.embed_tokens(input_ids)
118
+
119
+ if cache_position is None:
120
+ past_seen_tokens = (
121
+ # pyre-fixme[16]: Item `List` of `Union[List[torch._C.FloatTensor],
122
+ # DynamicCache]` has no attribute `get_seq_length`.
123
+ past_key_values.get_seq_length() if past_key_values is not None else 0
124
+ )
125
+ cache_position = torch.arange(
126
+ past_seen_tokens,
127
+ past_seen_tokens + inputs_embeds.shape[1],
128
+ device=inputs_embeds.device,
129
+ )
130
+ if position_ids is None:
131
+ position_ids = cache_position.unsqueeze(0)
132
+
133
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `_update_causal_mask`.
134
+ causal_mask = self._update_causal_mask(
135
+ attention_mask,
136
+ inputs_embeds,
137
+ cache_position,
138
+ past_key_values,
139
+ output_attentions,
140
+ )
141
+
142
+ hidden_states = inputs_embeds
143
+
144
+ # decoder layers
145
+ all_hidden_states = () if output_hidden_states else None
146
+ all_self_attns = () if output_attentions else None
147
+ next_decoder_cache = None
148
+
149
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `layers`.
150
+ for i, decoder_layer in enumerate(self.layers):
151
+ if output_hidden_states:
152
+ all_hidden_states += (hidden_states,)
153
+
154
+ if self.gradient_checkpointing and self.training:
155
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute
156
+ # `_gradient_checkpointing_func`.
157
+ layer_outputs = self._gradient_checkpointing_func(
158
+ decoder_layer.__call__,
159
+ hidden_states,
160
+ causal_mask,
161
+ position_ids,
162
+ past_key_values,
163
+ output_attentions,
164
+ use_cache,
165
+ cache_position,
166
+ )
167
+ else:
168
+ layer_outputs = decoder_layer(
169
+ hidden_states,
170
+ attention_mask=causal_mask,
171
+ position_ids=position_ids,
172
+ past_key_value=past_key_values,
173
+ output_attentions=output_attentions,
174
+ use_cache=use_cache,
175
+ cache_position=cache_position,
176
+ )
177
+
178
+ hidden_states = layer_outputs[0]
179
+
180
+ if use_cache:
181
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
182
+
183
+ if output_attentions:
184
+ all_self_attns += (layer_outputs[1],)
185
+
186
+ # pyre-fixme[16]: `CambrianQwenModel` has no attribute `norm`.
187
+ hidden_states = self.norm(hidden_states)
188
+
189
+ # add hidden states from the last decoder layer
190
+ if output_hidden_states:
191
+ all_hidden_states += (hidden_states,)
192
+
193
+ next_cache = None
194
+ if use_cache:
195
+ next_cache = (
196
+ next_decoder_cache.to_legacy_cache()
197
+ if use_legacy_cache
198
+ else next_decoder_cache
199
+ )
200
+
201
+ if not return_dict:
202
+ return tuple(
203
+ v
204
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
205
+ if v is not None
206
+ )
207
+ return BaseModelOutputWithPast(
208
+ last_hidden_state=hidden_states,
209
+ past_key_values=next_cache,
210
+ hidden_states=all_hidden_states,
211
+ attentions=all_self_attns,
212
+ )
213
+
214
+
215
+ class CambrianQwenForCausalLM(Qwen2ForCausalLM, CambrianMetaForCausalLM):
216
+ config_class = CambrianConfig
217
+
218
+ def __init__(self, config):
219
+ # super(Qwen2ForCausalLM, self).__init__(config)
220
+ Qwen2ForCausalLM.__init__(self, config)
221
+ config.model_type = "cambrian_qwen"
222
+ config.rope_scaling = None
223
+
224
+ self.model = CambrianQwenModel(config)
225
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
226
+ # Initialize weights and apply final processing
227
+ self.post_init()
228
+
229
+ def get_model(self):
230
+ return self.model
231
+
232
+ def forward(
233
+ self,
234
+ # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
235
+ input_ids: torch.LongTensor = None,
236
+ attention_mask: Optional[torch.Tensor] = None,
237
+ position_ids: Optional[torch.LongTensor] = None,
238
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
239
+ inputs_embeds: Optional[torch.FloatTensor] = None,
240
+ labels: Optional[torch.LongTensor] = None,
241
+ use_cache: Optional[bool] = None,
242
+ output_attentions: Optional[bool] = None,
243
+ output_hidden_states: Optional[bool] = None,
244
+ images: Optional[torch.FloatTensor] = None,
245
+ image_aux_attention_masks_list: Optional[List[torch.Tensor]] = None,
246
+ image_sizes: Optional[List[List[int]]] = None,
247
+ return_dict: Optional[bool] = None,
248
+ modalities: Optional[List[str]] = ["image"],
249
+ dpo_forward: Optional[bool] = False,
250
+ cache_position=None,
251
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
252
+
253
+ input_image_features = None
254
+ highres_image_features = None
255
+ frame_split_sizes = None
256
+
257
+ if inputs_embeds is None:
258
+ (
259
+ input_ids,
260
+ position_ids,
261
+ attention_mask,
262
+ past_key_values,
263
+ inputs_embeds,
264
+ labels,
265
+ vision_tower_aux_feature_list,
266
+ vision_tower_aux_attention_masks_list,
267
+ final_vision_feature_size,
268
+ global_context_feature,
269
+ ) = self.prepare_inputs_labels_for_multimodal(
270
+ input_ids,
271
+ position_ids,
272
+ attention_mask,
273
+ past_key_values,
274
+ labels,
275
+ images,
276
+ image_aux_attention_masks_list,
277
+ image_sizes,
278
+ )
279
+
280
+ if dpo_forward:
281
+ # pyre-fixme[29]: `CambrianQwenModel` is not a function.
282
+ outputs = self.model(
283
+ input_ids=input_ids,
284
+ attention_mask=attention_mask,
285
+ position_ids=position_ids,
286
+ past_key_values=past_key_values,
287
+ inputs_embeds=inputs_embeds,
288
+ use_cache=use_cache,
289
+ output_attentions=output_attentions,
290
+ output_hidden_states=output_hidden_states,
291
+ return_dict=return_dict,
292
+ )
293
+
294
+ hidden_states = outputs[0]
295
+ logits = self.lm_head(hidden_states)
296
+ return logits, labels
297
+
298
+ else:
299
+ if hasattr(self, "vision_tower_aux_feature_list"):
300
+ # pyre-fixme[29]: `CambrianQwenModel` is not a function.
301
+ outputs = self.model(
302
+ input_ids=input_ids,
303
+ attention_mask=attention_mask,
304
+ position_ids=position_ids,
305
+ past_key_values=past_key_values,
306
+ inputs_embeds=inputs_embeds,
307
+ use_cache=use_cache,
308
+ output_attentions=output_attentions,
309
+ output_hidden_states=output_hidden_states,
310
+ return_dict=return_dict,
311
+ vision_tower_aux_feature_list=(
312
+ # pyre-fixme[61]: `vision_tower_aux_feature_list` is
313
+ # undefined, or not always defined.
314
+ vision_tower_aux_feature_list
315
+ if inputs_embeds is None
316
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
317
+ # `vision_tower_aux_feature_list`.
318
+ else self.vision_tower_aux_feature_list
319
+ ),
320
+ vision_tower_aux_attention_masks_list=(
321
+ # pyre-fixme[61]: `vision_tower_aux_attention_masks_list` is
322
+ # undefined, or not always defined.
323
+ vision_tower_aux_attention_masks_list
324
+ if inputs_embeds is None
325
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
326
+ # `vision_tower_aux_attention_masks_list`.
327
+ else self.vision_tower_aux_attention_masks_list
328
+ ),
329
+ final_vision_feature_size=(
330
+ # pyre-fixme[61]: `final_vision_feature_size` is undefined,
331
+ # or not always defined.
332
+ final_vision_feature_size
333
+ if inputs_embeds is None
334
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
335
+ # `final_vision_feature_size`.
336
+ else self.final_vision_feature_size
337
+ ),
338
+ global_context_feature=(
339
+ # pyre-fixme[61]: `global_context_feature` is undefined, or
340
+ # not always defined.
341
+ global_context_feature
342
+ if inputs_embeds is None
343
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
344
+ # `global_context_feature`.
345
+ else self.global_context_feature
346
+ ),
347
+ )
348
+ else:
349
+ # pyre-fixme[29]: `CambrianQwenModel` is not a function.
350
+ outputs = self.model(
351
+ input_ids=input_ids,
352
+ attention_mask=attention_mask,
353
+ position_ids=position_ids,
354
+ past_key_values=past_key_values,
355
+ inputs_embeds=inputs_embeds,
356
+ use_cache=use_cache,
357
+ output_attentions=output_attentions,
358
+ output_hidden_states=output_hidden_states,
359
+ return_dict=return_dict,
360
+ # final_vision_feature_size=final_vision_feature_size,
361
+ )
362
+
363
+ hidden_states = outputs[0]
364
+ logits = self.lm_head(hidden_states)
365
+ logits = logits.float()
366
+
367
+ loss = None
368
+ if labels is not None:
369
+ # Shift so that tokens < n predict n
370
+ shift_logits = logits[..., :-1, :].contiguous()
371
+ shift_labels = labels[..., 1:].contiguous()
372
+ # Flatten the tokens
373
+ loss_fct = CrossEntropyLoss()
374
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute `config`.
375
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
376
+ shift_labels = shift_labels.view(-1)
377
+ # Enable model parallelism
378
+ shift_labels = shift_labels.to(shift_logits.device)
379
+ loss = loss_fct(shift_logits, shift_labels)
380
+
381
+ if not return_dict:
382
+ output = (logits,) + outputs[1:]
383
+ return (loss,) + output if loss is not None else output
384
+
385
+ return CausalLMOutputWithPast(
386
+ loss=loss,
387
+ logits=logits,
388
+ past_key_values=outputs.past_key_values,
389
+ hidden_states=outputs.hidden_states,
390
+ attentions=outputs.attentions,
391
+ )
392
+
393
+ @torch.no_grad()
394
+ def generate(
395
+ self,
396
+ inputs: Optional[torch.Tensor] = None,
397
+ images: Optional[torch.Tensor] = None,
398
+ image_sizes: Optional[torch.Tensor] = None,
399
+ **kwargs,
400
+ ) -> Union[GenerateOutput, torch.LongTensor]:
401
+ position_ids = kwargs.pop("position_ids", None)
402
+ attention_mask = kwargs.pop("attention_mask", None)
403
+ if "inputs_embeds" in kwargs:
404
+ raise NotImplementedError("`inputs_embeds` is not supported")
405
+
406
+ if images is not None:
407
+ (
408
+ inputs,
409
+ position_ids,
410
+ attention_mask,
411
+ _,
412
+ inputs_embeds,
413
+ _,
414
+ vision_tower_aux_feature_list,
415
+ vision_tower_aux_attention_masks_list,
416
+ final_vision_feature_size,
417
+ global_context_feature,
418
+ ) = self.prepare_inputs_labels_for_multimodal(
419
+ inputs,
420
+ position_ids,
421
+ attention_mask,
422
+ None,
423
+ None,
424
+ images,
425
+ image_sizes=image_sizes,
426
+ )
427
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
428
+ # `vision_tower_aux_feature_list`.
429
+ self.vision_tower_aux_feature_list = vision_tower_aux_feature_list
430
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
431
+ # `vision_tower_aux_attention_masks_list`.
432
+ self.vision_tower_aux_attention_masks_list = (
433
+ vision_tower_aux_attention_masks_list
434
+ )
435
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
436
+ # `final_vision_feature_size`.
437
+ self.final_vision_feature_size = final_vision_feature_size
438
+ # pyre-fixme[16]: `CambrianQwenForCausalLM` has no attribute
439
+ # `global_context_feature`.
440
+ self.global_context_feature = global_context_feature
441
+ else:
442
+ inputs_embeds = self.get_model().embed_tokens(inputs)
443
+
444
+ # pyre-fixme[16]: `Qwen2ForCausalLM` has no attribute `generate`.
445
+ return super().generate(
446
+ position_ids=position_ids,
447
+ attention_mask=attention_mask,
448
+ inputs_embeds=inputs_embeds,
449
+ **kwargs,
450
+ )
451
+
452
+ def prepare_inputs_for_generation(
453
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
454
+ ):
455
+ images = kwargs.pop("images", None)
456
+ image_sizes = kwargs.pop("image_sizes", None)
457
+ inputs = super().prepare_inputs_for_generation(
458
+ input_ids,
459
+ past_key_values=past_key_values,
460
+ inputs_embeds=inputs_embeds,
461
+ **kwargs,
462
+ )
463
+ if images is not None:
464
+ inputs["images"] = images
465
+ if image_sizes is not None:
466
+ inputs["image_sizes"] = image_sizes
467
+ return inputs
468
+
469
+
470
+ AutoConfig.register("cambrian_qwen", CambrianConfig)
471
+ AutoModelForCausalLM.register(CambrianConfig, CambrianQwenForCausalLM)
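A hedged wrapper around the generate() defined above. It assumes input_ids already contain the IMAGE_TOKEN_INDEX placeholder and that images/image_sizes come from the repo's preprocessing (see process_images in mm_datautils.py below); none of that preprocessing is reproduced here.

import torch

def run_video_qa(model, input_ids: torch.Tensor, images, image_sizes):
    # Greedy decoding through CambrianQwenForCausalLM.generate; the keyword
    # arguments mirror the signature shown above.
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images,
            image_sizes=image_sizes,
            do_sample=False,
            max_new_tokens=128,
            use_cache=True,
        )
    return output_ids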
longvu/make_delta.py ADDED
@@ -0,0 +1,66 @@
1
+ # pyre-unsafe
2
+ """
3
+ Usage:
4
+ python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
5
+ """
6
+
7
+ import argparse
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+
13
+ from .utils import auto_upgrade
14
+
15
+
16
+ def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
17
+ print("Loading base model")
18
+ base = AutoModelForCausalLM.from_pretrained(
19
+ base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
20
+ )
21
+
22
+ print("Loading target model")
23
+ auto_upgrade(target_model_path)
24
+ target = AutoModelForCausalLM.from_pretrained(
25
+ target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
26
+ )
27
+
28
+ print("Calculating delta")
29
+ for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
30
+ if name not in base.state_dict():
31
+ assert name in [
32
+ "model.mm_projector.weight",
33
+ "model.mm_projector.bias",
34
+ ], f"{name} not in base model"
35
+ continue
36
+ if param.data.shape == base.state_dict()[name].shape:
37
+ param.data -= base.state_dict()[name]
38
+ else:
39
+ assert name in [
40
+ "model.embed_tokens.weight",
41
+ "lm_head.weight",
42
+ ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}"
43
+ bparam = base.state_dict()[name]
44
+ param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam
45
+
46
+ print("Saving delta")
47
+ if hub_repo_id:
48
+ kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
49
+ else:
50
+ kwargs = {}
51
+ target.save_pretrained(delta_path, **kwargs)
52
+ target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
53
+ target_tokenizer.save_pretrained(delta_path, **kwargs)
54
+
55
+
56
+ if __name__ == "__main__":
57
+ parser = argparse.ArgumentParser()
58
+ parser.add_argument("--base-model-path", type=str, required=True)
59
+ parser.add_argument("--target-model-path", type=str, required=True)
60
+ parser.add_argument("--delta-path", type=str, required=True)
61
+ parser.add_argument("--hub-repo-id", type=str, default=None)
62
+ args = parser.parse_args()
63
+
64
+ make_delta(
65
+ args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id
66
+ )
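The module docstring shows the original llava CLI invocation; make_delta can also be called directly, as in this sketch (all paths are placeholders):

from longvu.make_delta import make_delta

make_delta(
    base_model_path="path/to/base-llm",      # placeholder
    target_model_path="path/to/finetuned",   # placeholder
    delta_path="path/to/delta-out",          # placeholder
    hub_repo_id=None,                        # set to a repo id to push the delta to the Hub
)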
longvu/mm_datautils.py ADDED
@@ -0,0 +1,1688 @@
1
+ # pyre-strict
2
+ import copy
3
+ import json
4
+ import os
5
+ import random
6
+ from dataclasses import dataclass
7
+ from typing import Dict, List, Sequence
8
+
9
+ import numpy as np
10
+ import tokenizers
11
+
12
+ import torch
13
+
14
+ import transformers
15
+
16
+ from longvu import conversation as conversation_lib
17
+
18
+ from longvu.constants import (
19
+ DEFAULT_IM_END_TOKEN,
20
+ DEFAULT_IM_START_TOKEN,
21
+ DEFAULT_IMAGE_TOKEN,
22
+ IGNORE_INDEX,
23
+ IMAGE_TOKEN_INDEX,
24
+ )
25
+
26
+ # pyre-fixme[21]: Could not find module `decord`.
27
+ from decord import cpu, VideoReader # @manual=fbsource//third-party/pypi/decord:decord
28
+
29
+ from packaging import version
30
+ from PIL import Image
31
+ from torch import distributed as dist
32
+ from torch.distributed.fsdp import (
33
+ FullStateDictConfig,
34
+ FullyShardedDataParallel as FSDP,
35
+ StateDictType,
36
+ )
37
+ from torch.utils.data import Dataset
38
+
39
+ # pyre-fixme
40
+ IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse(
41
+ "0.14"
42
+ )
43
+ from transformers import StoppingCriteria
44
+
45
+ from longvu.mm_utils import KeywordsStoppingCriteria
46
+
47
+
48
+ # pyre-fixme[3]: Return type must be annotated.
49
+ # pyre-fixme[2]: Parameter must be annotated.
50
+ def maybe_zero_3(param, ignore_status: bool = False, name=None):
51
+ # NO deepspeed
52
+
53
+ # from deepspeed import zero
54
+ # from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
55
+ # if hasattr(param, "ds_id"):
56
+ # if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
57
+ # if not ignore_status:
58
+ # print(name, 'no ignore status')
59
+ # with zero.GatheredParameters([param]):
60
+ # param = param.data.detach().cpu().clone()
61
+ # else:
62
+ # param = param.detach().cpu().clone()
63
+ return param.detach().cpu().clone()
64
+
65
+
66
+ # pyre-fixme[3]: Return type must be annotated.
67
+ # pyre-fixme[2]: Parameter must be annotated.
68
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
69
+ to_return = {
70
+ k: t
71
+ for k, t in named_params
72
+ if any(key_match in k for key_match in keys_to_match)
73
+ }
74
+ to_return = {
75
+ k: maybe_zero_3(v, ignore_status=True, name=k).cpu()
76
+ for k, v in to_return.items()
77
+ }
78
+ return to_return
79
+
80
+
81
+ # pyre-fixme[3]: Return type must be annotated.
82
+ # pyre-fixme[2]: Parameter must be annotated.
83
+ def find_all_linear_names(model):
84
+ cls = torch.nn.Linear
85
+ lora_module_names = set()
86
+ multimodal_keywords = ["mm_projector", "vision_tower", "vision_resampler"]
87
+ for name, module in model.named_modules():
88
+ if any(mm_keyword in name for mm_keyword in multimodal_keywords):
89
+ continue
90
+ if isinstance(module, cls):
91
+ names = name.split(".")
92
+ lora_module_names.add(names[0] if len(names) == 1 else names[-1])
93
+
94
+ if "lm_head" in lora_module_names: # needed for 16-bit
95
+ lora_module_names.remove("lm_head")
96
+ return list(lora_module_names)
97
+
98
+
99
+ def safe_save_model_for_hf_trainer(
100
+ trainer: transformers.Trainer, output_dir: str
101
+ ) -> None:
102
+ """Collects the state dict and dump to disk."""
103
+ global_rank = dist.get_rank()
104
+ save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
105
+ # pyre-fixme[16]: `Trainer` has no attribute `args`.
106
+ if len(trainer.args.fsdp) == 0:
107
+ # pyre-fixme[16]: `Trainer` has no attribute `model`.
108
+ cpu_state_dict = trainer.model.state_dict()
109
+ else:
110
+ with FSDP.state_dict_type(
111
+ trainer.model, StateDictType.FULL_STATE_DICT, save_policy
112
+ ):
113
+ cpu_state_dict = trainer.model.state_dict()
114
+
115
+ for key in cpu_state_dict.keys():
116
+ cpu_state_dict[key] = cpu_state_dict[key].to(torch.bfloat16)
117
+
118
+ if global_rank == 0:
119
+ trainer.model.config.save_pretrained(output_dir)
120
+ current_folder = output_dir.split("/")[-1]
121
+ parent_folder = os.path.dirname(output_dir)
122
+ save_path = os.path.join(output_dir, "pytorch_model.bin")
123
+ if getattr(trainer.args, "tune_mm_mlp_adapter", False) and not getattr(
124
+ trainer.args, "tune_text_decoder", False
125
+ ):
126
+ # Only save Adapter
127
+ keys_to_match = ["mm_projector"]
128
+ if getattr(trainer.args, "use_im_start_end", False):
129
+ keys_to_match.extend(["embed_tokens", "embed_in"])
130
+
131
+ freeze_layer_remove = []
132
+ for key in cpu_state_dict.keys():
133
+ remove = True
134
+ for key_match in keys_to_match:
135
+ if key_match in key:
136
+ remove = False
137
+ break
138
+ if remove:
139
+ freeze_layer_remove.append(key)
140
+ for key in freeze_layer_remove:
141
+ del cpu_state_dict[key]
142
+
143
+ if current_folder.startswith("checkpoint-"):
144
+ mm_projector_folder = os.path.join(parent_folder, "mm_projector")
145
+ os.makedirs(mm_projector_folder, exist_ok=True)
146
+ save_path = os.path.join(mm_projector_folder, f"{current_folder}.bin")
147
+ else:
148
+ save_path = os.path.join(output_dir, f"mm_projector.bin")
149
+ torch.save(cpu_state_dict, save_path)
150
+
151
+
152
+ def smart_tokenizer_and_embedding_resize(
153
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
154
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
155
+ special_tokens_dict: Dict,
156
+ tokenizer: transformers.PreTrainedTokenizer,
157
+ model: transformers.PreTrainedModel,
158
+ ) -> None:
159
+ """Resize tokenizer and embedding.
160
+
161
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
162
+ """
163
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
164
+ # pyre-fixme[16]: `PreTrainedModel` has no attribute `resize_token_embeddings`.
165
+ model.resize_token_embeddings(len(tokenizer))
166
+
167
+ if num_new_tokens > 0:
168
+ # pyre-fixme[16]: `PreTrainedModel` has no attribute `get_input_embeddings`.
169
+ input_embeddings = model.get_input_embeddings().weight.data
170
+ # pyre-fixme[16]: `PreTrainedModel` has no attribute `get_output_embeddings`.
171
+ output_embeddings = model.get_output_embeddings().weight.data
172
+
173
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
174
+ dim=0, keepdim=True
175
+ )
176
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
177
+ dim=0, keepdim=True
178
+ )
179
+
180
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
181
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
182
+
183
+
184
+ def _tokenize_fn(
185
+ strings: Sequence[str],
186
+ tokenizer: transformers.PreTrainedTokenizer,
187
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
188
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
189
+ ) -> Dict:
190
+ """Tokenize a list of strings."""
191
+ tokenized_list = [
192
+ tokenizer(
193
+ text,
194
+ return_tensors="pt",
195
+ padding="longest",
196
+ max_length=tokenizer.model_max_length,
197
+ truncation=True,
198
+ )
199
+ for text in strings
200
+ ]
201
+ input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
202
+ input_ids_lens = labels_lens = [
203
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
204
+ for tokenized in tokenized_list
205
+ ]
206
+ return dict(
207
+ input_ids=input_ids,
208
+ labels=labels,
209
+ input_ids_lens=input_ids_lens,
210
+ labels_lens=labels_lens,
211
+ )
212
+
213
+
214
+ # pyre-fixme[2]: Parameter must be annotated.
215
+ def _mask_targets(target, tokenized_lens, speakers) -> None:
216
+ # cur_idx = 0
217
+ cur_idx = tokenized_lens[0]
218
+ tokenized_lens = tokenized_lens[1:]
219
+ target[:cur_idx] = IGNORE_INDEX
220
+ for tokenized_len, speaker in zip(tokenized_lens, speakers):
221
+ if speaker == "human":
222
+ target[cur_idx + 2 : cur_idx + tokenized_len] = IGNORE_INDEX
223
+ cur_idx += tokenized_len
224
+
225
+
226
+ # pyre-fixme[3]: Return type must be annotated.
227
+ # pyre-fixme[2]: Parameter must be annotated.
228
+ def _add_speaker_and_signal(header, source, get_conversation: bool = True):
229
+ """Add speaker and start/end signal on each round."""
230
+ BEGIN_SIGNAL = "### "
231
+ END_SIGNAL = "\n"
232
+ conversation = header
233
+ for sentence in source:
234
+ from_str = sentence["from"]
235
+ if from_str.lower() == "human":
236
+ from_str = conversation_lib.default_conversation.roles[0]
237
+ elif from_str.lower() == "gpt":
238
+ from_str = conversation_lib.default_conversation.roles[1]
239
+ else:
240
+ from_str = "unknown"
241
+ sentence["value"] = (
242
+ BEGIN_SIGNAL + from_str + ": " + sentence["value"] + END_SIGNAL
243
+ )
244
+ if get_conversation:
245
+ conversation += sentence["value"]
246
+ conversation += BEGIN_SIGNAL
247
+ return conversation
248
+
249
+
250
+ # pyre-fixme[3]: Return type must be annotated.
251
+ # pyre-fixme[2]: Parameter must be annotated.
252
+ def expand2square(pil_img, background_color):
253
+ width, height = pil_img.size
254
+ if width == height:
255
+ return pil_img
256
+ elif width > height:
257
+ result = Image.new(pil_img.mode, (width, width), background_color)
258
+ result.paste(pil_img, (0, (width - height) // 2))
259
+ return result
260
+ else:
261
+ result = Image.new(pil_img.mode, (height, height), background_color)
262
+ result.paste(pil_img, ((height - width) // 2, 0))
263
+ return result
264
+
265
+
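+ # Illustrative example: expand2square pads a 1280x720 frame with the given background color
+ # (the callers below pass the processor's per-channel mean scaled to 0-255) into a centered
+ # 1280x1280 square before any further resizing.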
266
+ # pyre-fixme[3]: Return type must be annotated.
267
+ # pyre-fixme[2]: Parameter must be annotated.
268
+ def process_images(images, image_processor, model_cfg):
269
+ if isinstance(image_processor, list):
270
+ processor_aux_list = image_processor
271
+ new_images_aux_list = []
272
+ for image in images:
273
+ if isinstance(image, np.ndarray):
274
+ image = Image.fromarray(image)
275
+ image_aux_list = []
276
+ for processor_aux in processor_aux_list:
277
+ image_aux = image
278
+ if hasattr(processor_aux, "image_mean"):
279
+ try:
280
+ target_resolution = processor_aux.crop_size["height"]
281
+ except Exception:
282
+ target_resolution = processor_aux.size["height"]
283
+ image_aux = expand2square(
284
+ image_aux, tuple(int(x * 255) for x in processor_aux.image_mean)
285
+ ).resize((target_resolution, target_resolution))
286
+ image_aux = processor_aux.preprocess(image_aux, return_tensors="pt")[
287
+ "pixel_values"
288
+ ][0]
289
+ image_aux_list.append(image_aux)
290
+ new_images_aux_list.append(image_aux_list)
291
+ new_images_aux_list = [
292
+ list(batch_image_aux) for batch_image_aux in zip(*new_images_aux_list)
293
+ ]
294
+ new_images_aux_list = [
295
+ torch.stack(image_aux).half().cuda() for image_aux in new_images_aux_list
296
+ ]
297
+ return new_images_aux_list
298
+ else:
299
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
300
+ new_images = []
301
+ if image_aspect_ratio == "pad":
302
+ for image in images:
303
+ image = expand2square(
304
+ image, tuple(int(x * 255) for x in image_processor.image_mean)
305
+ )
306
+ image = image_processor.preprocess(image, return_tensors="pt")[
307
+ "pixel_values"
308
+ ][0]
309
+ new_images.append(image)
310
+ else:
311
+ return image_processor(images, return_tensors="pt")["pixel_values"]
312
+ if all(x.shape == new_images[0].shape for x in new_images):
313
+ new_images = torch.stack(new_images, dim=0)
314
+ return new_images
315
+
316
+
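+ # Note on process_images above: when image_processor is a list (one processor per vision tower),
+ # each frame is square-padded, resized, and preprocessed per tower, and the result is a list with
+ # one stacked half-precision CUDA tensor per tower, roughly (num_frames, 3, H_i, W_i) each.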
317
+ # pyre-fixme[2]: Parameter must be annotated.
318
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
319
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
320
+ def preprocess_multimodal(sources: Sequence[str], data_args) -> Dict:
321
+ is_multimodal = data_args.is_multimodal
322
+ if not is_multimodal:
323
+ # pyre-fixme[7]: Expected `Dict[typing.Any, typing.Any]` but got
324
+ # `Sequence[str]`.
325
+ return sources
326
+
327
+ for source in sources:
328
+ for sentence in source:
329
+ if (
330
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]`
331
+ # but got `str`.
332
+ DEFAULT_IMAGE_TOKEN in sentence["value"]
333
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]`
334
+ # but got `str`.
335
+ or "<video>" in sentence["value"]
336
+ ):
337
+ # pyre-fixme[16]: `str` has no attribute `__setitem__`.
338
+ sentence["value"] = (
339
+ # pyre-fixme[6]: For 1st argument expected `Union[slice,
340
+ # SupportsIndex]` but got `str`.
341
+ sentence["value"]
342
+ .replace(DEFAULT_IMAGE_TOKEN, "")
343
+ .replace("<video>", "")
344
+ .strip()
345
+ )
346
+ # pyre-fixme[6]: For 1st argument expected `Union[slice,
347
+ # SupportsIndex]` but got `str`.
348
+ sentence["value"] = DEFAULT_IMAGE_TOKEN + "\n" + sentence["value"]
349
+ # pyre-fixme[6]: For 1st argument expected `Union[slice,
350
+ # SupportsIndex]` but got `str`.
351
+ sentence["value"] = sentence["value"].strip()
352
+ if "mmtag" in conversation_lib.default_conversation.version:
353
+ # pyre-fixme[6]: For 1st argument expected `Union[slice,
354
+ # SupportsIndex]` but got `str`.
355
+ sentence["value"] = sentence["value"].replace(
356
+ DEFAULT_IMAGE_TOKEN,
357
+ "<Image>" + DEFAULT_IMAGE_TOKEN + "</Image>",
358
+ )
359
+ replace_token = DEFAULT_IMAGE_TOKEN
360
+ if data_args.mm_use_im_start_end:
361
+ replace_token = (
362
+ DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
363
+ )
364
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]`
365
+ # but got `str`.
366
+ sentence["value"] = sentence["value"].replace(
367
+ DEFAULT_IMAGE_TOKEN, replace_token
368
+ )
369
+
370
+ # pyre-fixme[7]: Expected `Dict[typing.Any, typing.Any]` but got `Sequence[str]`.
371
+ return sources
372
+
373
+
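+ # Illustrative example (assuming DEFAULT_IMAGE_TOKEN is "<image>"): a turn such as
+ # "What happens in this clip? <video>" is rewritten to "<image>\nWhat happens in this clip?",
+ # i.e. any image/video placeholder is stripped and a single image token is moved to the front.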
374
+ def preprocess_llama_2(
375
+ # pyre-fixme[2]: Parameter must be annotated.
376
+ sources,
377
+ tokenizer: transformers.PreTrainedTokenizer,
378
+ has_image: bool = False,
379
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
380
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
381
+ ) -> Dict:
382
+ conv = conversation_lib.default_conversation.copy()
383
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
384
+
385
+ # Apply prompt templates
386
+ conversations = []
387
+ for i, source in enumerate(sources):
388
+ if roles[source[0]["from"]] != conv.roles[0]:
389
+ # Skip the first one if it is not from human
390
+ source = source[1:]
391
+
392
+ conv.messages = []
393
+ for j, sentence in enumerate(source):
394
+ role = roles[sentence["from"]]
395
+ assert role == conv.roles[j % 2], f"{i}"
396
+ conv.append_message(role, sentence["value"])
397
+ conversations.append(conv.get_prompt())
398
+
399
+ # Tokenize conversations
400
+
401
+ if has_image:
402
+ input_ids = torch.stack(
403
+ [
404
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
405
+ for prompt in conversations
406
+ ],
407
+ dim=0,
408
+ )
409
+ else:
410
+ input_ids = tokenizer(
411
+ conversations,
412
+ return_tensors="pt",
413
+ padding="longest",
414
+ max_length=tokenizer.model_max_length,
415
+ truncation=True,
416
+ ).input_ids
417
+
418
+ targets = input_ids.clone()
419
+
420
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
421
+
422
+ # Mask targets
423
+ sep = "[/INST] "
424
+ for conversation, target in zip(conversations, targets):
425
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
426
+
427
+ rounds = conversation.split(conv.sep2)
428
+ cur_len = 1
429
+ target[:cur_len] = IGNORE_INDEX
430
+ for i, rou in enumerate(rounds):
431
+ if rou == "":
432
+ break
433
+
434
+ parts = rou.split(sep)
435
+ if len(parts) != 2:
436
+ break
437
+ parts[0] += sep
438
+
439
+ if has_image:
440
+ round_len = len(tokenizer_image_token(rou, tokenizer))
441
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
442
+ else:
443
+ round_len = len(tokenizer(rou).input_ids)
444
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
445
+
446
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
447
+
448
+ cur_len += round_len
449
+ target[cur_len:] = IGNORE_INDEX
450
+
451
+ if cur_len < tokenizer.model_max_length:
452
+ if cur_len != total_len:
453
+ target[:] = IGNORE_INDEX
454
+ print(
455
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
456
+ f" (ignored)"
457
+ )
458
+
459
+ return dict(
460
+ input_ids=input_ids,
461
+ labels=targets,
462
+ )
463
+
464
+
465
+ def preprocess_v1(
466
+ # pyre-fixme[2]: Parameter must be annotated.
467
+ sources,
468
+ tokenizer: transformers.PreTrainedTokenizer,
469
+ has_image: bool = False,
470
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
471
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
472
+ ) -> Dict:
473
+ conv = conversation_lib.default_conversation.copy()
474
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
475
+
476
+ # Apply prompt templates
477
+ conversations = []
478
+ for i, source in enumerate(sources):
479
+ if roles[source[0]["from"]] != conv.roles[0]:
480
+ # Skip the first one if it is not from human
481
+ source = source[1:]
482
+
483
+ conv.messages = []
484
+ for j, sentence in enumerate(source):
485
+ role = roles[sentence["from"]]
486
+ assert role == conv.roles[j % 2], f"{i}"
487
+ conv.append_message(role, sentence["value"])
488
+ conversations.append(conv.get_prompt())
489
+
490
+ # Tokenize conversations
491
+
492
+ if has_image:
493
+ input_ids = torch.stack(
494
+ [
495
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
496
+ for prompt in conversations
497
+ ],
498
+ dim=0,
499
+ )
500
+ else:
501
+ input_ids = tokenizer(
502
+ conversations,
503
+ return_tensors="pt",
504
+ padding="longest",
505
+ max_length=tokenizer.model_max_length,
506
+ truncation=True,
507
+ ).input_ids
508
+
509
+ targets = input_ids.clone()
510
+
511
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
512
+
513
+ # Mask targets
514
+ sep = conv.sep + conv.roles[1] + ": "
515
+ for conversation, target in zip(conversations, targets):
516
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
517
+
518
+ rounds = conversation.split(conv.sep2)
519
+ cur_len = 1
520
+ target[:cur_len] = IGNORE_INDEX
521
+ for i, rou in enumerate(rounds):
522
+ if rou == "":
523
+ break
524
+
525
+ parts = rou.split(sep)
526
+ if len(parts) != 2:
527
+ break
528
+ parts[0] += sep
529
+
530
+ if has_image:
531
+ round_len = len(tokenizer_image_token(rou, tokenizer))
532
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
533
+ else:
534
+ round_len = len(tokenizer(rou).input_ids)
535
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
536
+ # pyre-fixme
537
+ if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14:
538
+ round_len -= 1
539
+ instruction_len -= 1
540
+
541
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
542
+
543
+ cur_len += round_len
544
+ target[cur_len:] = IGNORE_INDEX
545
+
546
+ if cur_len < tokenizer.model_max_length:
547
+ if cur_len != total_len:
548
+ target[:] = IGNORE_INDEX
549
+ print(
550
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
551
+ f" (ignored)"
552
+ )
553
+
554
+ return dict(
555
+ input_ids=input_ids,
556
+ labels=targets,
557
+ )
558
+
559
+
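+ # Note on preprocess_v1 above: each conversation is split into rounds on conv.sep2, the
+ # instruction portion of every round (everything before the assistant reply) is masked with
+ # IGNORE_INDEX, and the final length check masks the whole sample if the re-tokenized lengths
+ # drift from the original tokenization.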
560
+ # pyre-fixme[3]: Return type must be annotated.
561
+ def tokenizer_image_token(
562
+ # pyre-fixme[2]: Parameter must be annotated.
563
+ prompt,
564
+ # pyre-fixme[2]: Parameter must be annotated.
565
+ tokenizer,
566
+ # pyre-fixme[2]: Parameter must be annotated.
567
+ image_token_index=IMAGE_TOKEN_INDEX,
568
+ # pyre-fixme[2]: Parameter must be annotated.
569
+ return_tensors=None,
570
+ ):
571
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
572
+
573
+ # pyre-fixme[3]: Return type must be annotated.
574
+ # pyre-fixme[2]: Parameter must be annotated.
575
+ def insert_separator(X, sep):
576
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
577
+
578
+ input_ids = []
579
+ offset = 0
580
+ if (
581
+ len(prompt_chunks) > 0
582
+ and len(prompt_chunks[0]) > 0
583
+ and prompt_chunks[0][0] == tokenizer.bos_token_id
584
+ ):
585
+ offset = 1
586
+ input_ids.append(prompt_chunks[0][0])
587
+
588
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
589
+ input_ids.extend(x[offset:])
590
+
591
+ if return_tensors is not None:
592
+ if return_tensors == "pt":
593
+ return torch.tensor(input_ids, dtype=torch.long)
594
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
595
+ return input_ids
596
+
597
+
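+ # Illustrative example: for a prompt like "USER: <image>\nWhat is shown?", the chunks around
+ # "<image>" are tokenized separately and re-joined with image_token_index (IMAGE_TOKEN_INDEX by
+ # default) inserted between them, preserving a single leading bos token if the tokenizer adds one.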
598
+ # pyre-fixme[3]: Return type must be annotated.
599
+ def tokenizer_image_token_llama3(
600
+ # pyre-fixme[2]: Parameter must be annotated.
601
+ prompt,
602
+ # pyre-fixme[2]: Parameter must be annotated.
603
+ tokenizer,
604
+ # pyre-fixme[2]: Parameter must be annotated.
605
+ image_token_index=IMAGE_TOKEN_INDEX,
606
+ # pyre-fixme[2]: Parameter must be annotated.
607
+ return_tensors=None,
608
+ ):
609
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
610
+
611
+ # pyre-fixme[3]: Return type must be annotated.
612
+ # pyre-fixme[2]: Parameter must be annotated.
613
+ def insert_separator(X, sep):
614
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
615
+
616
+ input_ids = []
617
+ for x in insert_separator(prompt_chunks, [image_token_index]):
618
+ input_ids.extend(x)
619
+
620
+ if return_tensors is not None:
621
+ if return_tensors == "pt":
622
+ return torch.tensor(input_ids, dtype=torch.long)
623
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
624
+ return input_ids
625
+
626
+
627
+ def preprocess_qwen(
628
+ # pyre-fixme[2]: Parameter must be annotated.
629
+ sources,
630
+ tokenizer: transformers.PreTrainedTokenizer,
631
+ has_image: bool = False,
632
+ system_message: str = "You are a helpful assistant.",
633
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
634
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
635
+ ) -> Dict:
636
+ # roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}
637
+ roles = {"human": "user", "gpt": "assistant"}
638
+
639
+ # Add the image token to the tokenizer as a special token
640
+ # Use a deepcopy of the tokenizer so that we don't modify the original
641
+ tokenizer = copy.deepcopy(tokenizer)
642
+ # When there is actually an image, we add the image tokens as a special token
643
+ if has_image:
644
+ tokenizer.add_tokens(["<image>"], special_tokens=True)
645
+
646
+ image_token_index = tokenizer.convert_tokens_to_ids("<image>")
647
+ im_start, im_end = tokenizer.additional_special_tokens_ids
648
+ # unmask_tokens = ["<|im_start|>", "<|im_start|>", "\n"]
649
+ unmask_tokens_idx = [198, im_start, im_end]
650
+ nl_tokens = tokenizer("\n").input_ids
651
+
652
+ # Reset the Qwen chat template so that it won't prepend the system message every time we apply it
653
+ chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
654
+ tokenizer.chat_template = chat_template
655
+
656
+ # _system = tokenizer("system").input_ids + nl_tokens
657
+ # _user = tokenizer("user").input_ids + nl_tokens
658
+ # _assistant = tokenizer("assistant").input_ids + nl_tokens
659
+
660
+ # Apply prompt templates
661
+ input_ids, targets = [], []
662
+ for i, source in enumerate(sources):
663
+ if roles[source[0]["from"]] != roles["human"]:
664
+ source = source[1:]
665
+
666
+ input_id, target = [], []
667
+
668
+ # New version, use apply chat template
669
+ # Build system message for each sentence
670
+ input_id += tokenizer.apply_chat_template(
671
+ [{"role": "system", "content": system_message}]
672
+ )
673
+ target += [IGNORE_INDEX] * len(input_id)
674
+
675
+ for conv in source:
676
+ # Make sure llava data can load
677
+ try:
678
+ role = conv["role"]
679
+ content = conv["content"]
680
+ except KeyError:
681
+ role = conv["from"]
682
+ content = conv["value"]
683
+
684
+ role = roles.get(role, role)
685
+
686
+ conv = [{"role": role, "content": content}]
687
+ encode_id = tokenizer.apply_chat_template(conv)
688
+ input_id += encode_id
689
+ if role in ["user", "system"]:
690
+ target += [IGNORE_INDEX] * len(encode_id)
691
+ else:
692
+ target += encode_id
693
+
694
+ assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
695
+ for idx, encode_id in enumerate(input_id):
696
+ if encode_id in unmask_tokens_idx:
697
+ target[idx] = encode_id
698
+ if encode_id == image_token_index:
699
+ input_id[idx] = IMAGE_TOKEN_INDEX
700
+ input_ids.append(input_id)
701
+ targets.append(target)
702
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
703
+ targets = torch.tensor(targets, dtype=torch.long)
704
+
705
+ return dict(
706
+ input_ids=input_ids, # tensor(bs x seq_len)
707
+ labels=targets, # tensor(bs x seq_len)
708
+ )
709
+
710
+
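+ # Note on preprocess_qwen above: "<image>" is registered as a special token only on a deepcopy
+ # of the tokenizer, the chat template is applied per turn, user/system tokens are masked with
+ # IGNORE_INDEX, and any "<image>" ids are swapped back to IMAGE_TOKEN_INDEX afterwards, so the
+ # real tokenizer and the model vocabulary are never resized.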
711
+ def preprocess_llama3(
712
+ # pyre-fixme[2]: Parameter must be annotated.
713
+ sources,
714
+ tokenizer: transformers.PreTrainedTokenizer,
715
+ has_image: bool = False,
716
+ system_message: str = "You are a helpful assistant.",
717
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
718
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
719
+ ) -> Dict:
720
+ # roles = {"human": "<|start_header_id|>user<|end_header_id|>", "gpt": "<|start_header_id|>assistant<|end_header_id|>"}
721
+ roles = {"human": "user", "gpt": "assistant"}
722
+
723
+ # Add the image token to the tokenizer as a special token
724
+ # Use a deepcopy of the tokenizer so that we don't modify the original
725
+ tokenizer = copy.deepcopy(tokenizer)
726
+ # When there is actually an image, we add the image tokens as a special token
727
+ if has_image:
728
+ tokenizer.add_tokens(["<image>"], special_tokens=True)
729
+ image_token_index = tokenizer.convert_tokens_to_ids("<image>")
730
+ bos_token_id = tokenizer.convert_tokens_to_ids("<|begin_of_text|>")
731
+ start_header_id = tokenizer.convert_tokens_to_ids("<|start_header_id|>")
732
+ end_header_id = tokenizer.convert_tokens_to_ids("<|end_header_id|>")
733
+ eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
734
+
735
+ unmask_tokens = [
736
+ "<|begin_of_text|>",
737
+ "<|start_header_id|>",
738
+ "<|end_header_id|>",
739
+ "<|eot_id|>",
740
+ "\n\n",
741
+ ]
742
+ unmask_tokens_idx = [tokenizer.convert_tokens_to_ids(tok) for tok in unmask_tokens]
743
+
744
+ # After the tokenizers update, calling the llama3 tokenizer will
746
+ # automatically prepend the bos id to the tokens. ヽ(`⌒´)ノ
746
+ # pyre-fixme[53]: Captured variable `bos_token_id` is not annotated.
747
+ # pyre-fixme[3]: Return type must be annotated.
748
+ # pyre-fixme[2]: Parameter must be annotated.
749
+ def safe_tokenizer_llama3(text):
750
+ input_ids = tokenizer(text).input_ids
751
+ if input_ids[0] == bos_token_id:
752
+ input_ids = input_ids[1:]
753
+ return input_ids
754
+
755
+ nl_tokens = tokenizer.convert_tokens_to_ids("\n\n")
756
+
757
+ # chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{%- if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}"
758
+ chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
759
+ tokenizer.chat_template = chat_template
760
+
761
+ # Apply prompt templates
762
+ input_ids, targets = [], []
763
+ for i, source in enumerate(sources):
764
+ if roles[source[0]["from"]] != roles["human"]:
765
+ source = source[1:]
766
+
767
+ input_id, target = [], []
768
+
769
+ # New version, use apply chat template
770
+ # Build system message for each sentence
771
+ input_id += tokenizer.apply_chat_template(
772
+ [{"role": "system", "content": system_message}]
773
+ # pyre-fixme[6]: For 1st argument expected `Union[int, str]` but got `slice`.
774
+ )[:-4]
775
+
776
+ target += [IGNORE_INDEX] * len(input_id)
777
+
778
+ for conv in source:
779
+ # Make sure llava data can load
780
+ try:
781
+ role = conv["role"]
782
+ content = conv["content"]
783
+ except KeyError:
784
+ role = conv["from"]
785
+ content = conv["value"]
786
+
787
+ role = roles.get(role, role)
788
+
789
+ conv = [{"role": role, "content": content}]
790
+ # First is bos token we don't need here
791
+ # pyre-fixme[6]: For 1st argument expected `Union[int, str]` but got
792
+ # `slice`.
793
+ encode_id = tokenizer.apply_chat_template(conv)[1:-4]
794
+ input_id += encode_id
795
+ if role in ["user", "system"]:
796
+ target += [IGNORE_INDEX] * len(encode_id)
797
+ else:
798
+ target += encode_id
799
+
800
+ assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
801
+ for idx, encode_id in enumerate(input_id):
802
+ if encode_id in unmask_tokens_idx:
803
+ target[idx] = encode_id
804
+ if encode_id == image_token_index:
805
+ input_id[idx] = IMAGE_TOKEN_INDEX
806
+ input_ids.append(input_id)
807
+ targets.append(target)
808
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
809
+ targets = torch.tensor(targets, dtype=torch.long)
810
+
811
+ print("input_ids", input_ids, flush=True)
812
+ print("targets", targets, flush=True)
813
+
814
+ return dict(
815
+ input_ids=input_ids, # tensor(bs x seq_len)
816
+ labels=targets, # tensor(bs x seq_len)
817
+ )
818
+
819
+
820
+ def preprocess_llama_3_1(
821
+ # pyre-fixme[2]: Parameter must be annotated.
822
+ sources,
823
+ tokenizer: transformers.PreTrainedTokenizer,
824
+ has_image: bool = False,
825
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
826
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
827
+ ) -> Dict:
828
+ conv = conversation_lib.default_conversation.copy()
829
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
830
+
831
+ # Apply prompt templates
832
+ conversations = []
833
+ for i, source in enumerate(sources):
834
+ if roles[source[0]["from"]] != conv.roles[0]:
835
+ # Skip the first one if it is not from human
836
+ source = source[1:]
837
+
838
+ conv.messages = []
839
+ for j, sentence in enumerate(source):
840
+ if sentence["from"] == "Answer":
841
+ sentence["from"] = "gpt" # data bug
842
+ role = roles[sentence["from"]]
843
+ # assert role == conv.roles[j % 2], f"{i}"
844
+ conv.append_message(role, sentence["value"])
845
+ conversations.append(conv.get_prompt())
846
+
847
+ # Tokenize conversations
848
+
849
+ if has_image:
850
+ input_ids = torch.stack(
851
+ [
852
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
853
+ for prompt in conversations
854
+ ],
855
+ dim=0,
856
+ )
857
+ else:
858
+ input_ids = tokenizer(
859
+ conversations,
860
+ return_tensors="pt",
861
+ padding="longest",
862
+ max_length=tokenizer.model_max_length,
863
+ truncation=True,
864
+ ).input_ids
865
+
866
+ # remove the first bos token
867
+ if input_ids[0][0] == input_ids[0][1] == tokenizer.bos_token_id:
868
+ input_ids = input_ids[:, 1:]
869
+ targets = input_ids.clone()
870
+
871
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_3_1
872
+
873
+ # Mask targets
874
+ sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>" + "\n\n"
875
+ # sep = conv.sep + conv.roles[1] + ": "
876
+ for conversation, target in zip(conversations, targets):
877
+ total_len = int(target.shape[0])
878
+
879
+ rounds = conversation.split(conv.tokenizer.eos_token)
880
+ rounds = [rounds[0]] + [
881
+ rounds[idx] + rounds[idx + 1] for idx in range(1, len(rounds) - 1, 2)
882
+ ]
883
+
884
+ cur_len = 1
885
+ target[:cur_len] = IGNORE_INDEX
886
+ for i, rou in enumerate(rounds):
887
+ if rou == "":
888
+ break
889
+
890
+ parts = rou.split(sep)
891
+ if len(parts) != 2 and i != 0:
892
+ break
893
+
894
+ if i == 0:
895
+ round_len = len(tokenizer(rou, add_special_tokens=False).input_ids)
896
+ instruction_len = len(
897
+ tokenizer(rou, add_special_tokens=False).input_ids
898
+ )
899
+
900
+ else:
901
+ parts[0] += sep
902
+ if has_image:
903
+ round_len = len(tokenizer_image_token(rou, tokenizer)) + 1
904
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
905
+ else:
906
+ round_len = len(tokenizer(rou).input_ids) + 1
907
+ instruction_len = len(tokenizer(parts[0]).input_ids)
908
+
909
+ # if i > 0: round_len += 1
910
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
911
+ cur_len += round_len
912
+
913
+ target[cur_len:] = IGNORE_INDEX
914
+ cur_len = cur_len + len(tokenizer(sep, add_special_tokens=False).input_ids)
915
+
916
+ # if cur_len > tokenizer.model_max_length: print(f"WARNING: max length context")
917
+ if cur_len < tokenizer.model_max_length:
918
+ if cur_len != total_len:
919
+ target[:] = IGNORE_INDEX
920
+ print(
921
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
922
+ f" (ignored)"
923
+ )
924
+
925
+ return dict(
926
+ input_ids=input_ids,
927
+ labels=targets,
928
+ )
929
+
930
+
931
+ def preprocess_llama_3_2(
932
+ # pyre-fixme[2]: Parameter must be annotated.
933
+ sources,
934
+ tokenizer: transformers.PreTrainedTokenizer,
935
+ has_image: bool = False,
936
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
937
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
938
+ ) -> Dict:
939
+ conv = conversation_lib.default_conversation.copy()
940
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
941
+
942
+ # Apply prompt templates
943
+ conversations = []
944
+ for i, source in enumerate(sources):
945
+ if roles[source[0]["from"]] != conv.roles[0]:
946
+ # Skip the first one if it is not from human
947
+ source = source[1:]
948
+
949
+ conv.messages = []
950
+ for j, sentence in enumerate(source):
951
+ role = roles[sentence["from"]]
952
+ assert role == conv.roles[j % 2], f"{i}"
953
+ conv.append_message(role, sentence["value"])
954
+ conversations.append(conv.get_prompt())
955
+
956
+ # Tokenize conversations
957
+
958
+ if has_image:
959
+ input_ids = torch.stack(
960
+ [
961
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
962
+ for prompt in conversations
963
+ ],
964
+ dim=0,
965
+ )
966
+ else:
967
+ input_ids = tokenizer(
968
+ conversations,
969
+ return_tensors="pt",
970
+ padding="longest",
971
+ max_length=tokenizer.model_max_length,
972
+ truncation=True,
973
+ ).input_ids
974
+
975
+ # remove the first bos token
976
+ if input_ids[0][0] == input_ids[0][1] == tokenizer.bos_token_id:
977
+ input_ids = input_ids[:, 1:]
978
+ targets = input_ids.clone()
979
+
980
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_3_2
981
+
982
+ # Mask targets
983
+ sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>" + "\n\n"
984
+ # sep = conv.sep + conv.roles[1] + ": "
985
+ for conversation, target in zip(conversations, targets):
986
+ total_len = int(target.shape[0])
987
+
988
+ rounds = conversation.split(conv.tokenizer.eos_token)
989
+ rounds = [rounds[0]] + [
990
+ rounds[idx] + rounds[idx + 1] for idx in range(1, len(rounds) - 1, 2)
991
+ ]
992
+
993
+ cur_len = 1
994
+ target[:cur_len] = IGNORE_INDEX
995
+ for i, rou in enumerate(rounds):
996
+ if rou == "":
997
+ break
998
+
999
+ parts = rou.split(sep)
1000
+ if len(parts) != 2 and i != 0:
1001
+ break
1002
+
1003
+ if i == 0:
1004
+ round_len = len(tokenizer(rou, add_special_tokens=False).input_ids)
1005
+ instruction_len = len(
1006
+ tokenizer(rou, add_special_tokens=False).input_ids
1007
+ )
1008
+
1009
+ else:
1010
+ parts[0] += sep
1011
+ if has_image:
1012
+ round_len = len(tokenizer_image_token(rou, tokenizer)) + 1
1013
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
1014
+ else:
1015
+ round_len = len(tokenizer(rou).input_ids) + 1
1016
+ instruction_len = len(tokenizer(parts[0]).input_ids)
1017
+
1018
+ # if i > 0: round_len += 1
1019
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1020
+ cur_len += round_len
1021
+
1022
+ target[cur_len:] = IGNORE_INDEX
1023
+ cur_len = cur_len + len(tokenizer(sep, add_special_tokens=False).input_ids)
1024
+
1025
+ # if cur_len > tokenizer.model_max_length: print(f"WARNING: max length context")
1026
+ if cur_len < tokenizer.model_max_length:
1027
+ if cur_len != total_len:
1028
+ target[:] = IGNORE_INDEX
1029
+ print(
1030
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1031
+ f" (ignored)"
1032
+ )
1033
+
1034
+ return dict(
1035
+ input_ids=input_ids,
1036
+ labels=targets,
1037
+ )
1038
+
1039
+
1040
+ def preprocess_phi3(
1041
+ # pyre-fixme[2]: Parameter must be annotated.
1042
+ sources,
1043
+ tokenizer: transformers.PreTrainedTokenizer,
1044
+ has_image: bool = False,
1045
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
1046
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
1047
+ ) -> Dict:
1048
+ conv = conversation_lib.conv_templates["phi3"].copy()
1049
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
1050
+
1051
+ # Apply prompt templates
1052
+ conversations = []
1053
+ for i, source in enumerate(sources):
1054
+ if roles[source[0]["from"]] != conv.roles[0]:
1055
+ # Skip the first one if it is not from human
1056
+ source = source[1:]
1057
+
1058
+ conv.messages = []
1059
+ for j, sentence in enumerate(source):
1060
+ role = roles[sentence["from"]]
1061
+ assert role == conv.roles[j % 2], f"{i}"
1062
+ conv.append_message(role, sentence["value"])
1063
+ conversations.append(conv.get_prompt())
1064
+
1065
+ # Tokenize conversations
1066
+ if has_image:
1067
+ input_ids = torch.stack(
1068
+ [
1069
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1070
+ for prompt in conversations
1071
+ ],
1072
+ dim=0,
1073
+ )
1074
+ else:
1075
+ input_ids = tokenizer(
1076
+ conversations,
1077
+ return_tensors="pt",
1078
+ padding="longest",
1079
+ max_length=tokenizer.model_max_length,
1080
+ truncation=True,
1081
+ ).input_ids
1082
+
1083
+ targets = input_ids.clone()
1084
+ assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
1085
+
1086
+ # Mask targets
1087
+ sep = conv.sep + conv.roles[1]
1088
+ for conversation, target in zip(conversations, targets):
1089
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
1090
+
1091
+ rounds = conversation.split(conv.sep)
1092
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
1093
+ for conv_idx in range(3, len(rounds), 2):
1094
+ re_rounds.append(
1095
+ conv.sep.join(rounds[conv_idx : conv_idx + 2])
1096
+ ) # user + gpt
1097
+ cur_len = 0
1098
+ target[:cur_len] = IGNORE_INDEX
1099
+ for i, rou in enumerate(re_rounds):
1100
+ if rou == "":
1101
+ break
1102
+
1103
+ parts = rou.split(sep)
1104
+ if len(parts) != 2:
1105
+ break
1106
+ parts[0] += sep
1107
+
1108
+ if has_image:
1109
+ round_len = len(tokenizer_image_token(rou, tokenizer))
1110
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
1111
+ else:
1112
+ round_len = len(tokenizer(rou).input_ids)
1113
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
1114
+
1115
+ if i == 0:
1116
+ round_len += 1
1117
+ instruction_len += 1
1118
+ else:
1119
+ round_len -= 2
1120
+ instruction_len -= 2
1121
+
1122
+ if (
1123
+ i != 0
1124
+ and getattr(tokenizer, "legacy", False)
1125
+ and IS_TOKENIZER_GREATER_THAN_0_14
1126
+ ):
1127
+ round_len += 1
1128
+ instruction_len += 1
1129
+
1130
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1131
+
1132
+ cur_len += round_len
1133
+ target[cur_len:] = IGNORE_INDEX
1134
+
1135
+ if cur_len < tokenizer.model_max_length:
1136
+ if cur_len != total_len:
1137
+ target[:] = IGNORE_INDEX
1138
+ print(
1139
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1140
+ f" (ignored)"
1141
+ )
1142
+
1143
+ return dict(
1144
+ input_ids=input_ids,
1145
+ labels=targets,
1146
+ )
1147
+
1148
+
1149
+ def preprocess_mpt(
1150
+ # pyre-fixme[2]: Parameter must be annotated.
1151
+ sources,
1152
+ tokenizer: transformers.PreTrainedTokenizer,
1153
+ has_image: bool = False,
1154
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
1155
+ ) -> Dict:
1156
+ conv = conversation_lib.default_conversation.copy()
1157
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
1158
+
1159
+ # Apply prompt templates
1160
+ conversations = []
1161
+ for i, source in enumerate(sources):
1162
+ if roles[source[0]["from"]] != conv.roles[0]:
1163
+ # Skip the first one if it is not from human
1164
+ source = source[1:]
1165
+
1166
+ conv.messages = []
1167
+ for j, sentence in enumerate(source):
1168
+ role = roles[sentence["from"]]
1169
+ assert role == conv.roles[j % 2], f"{i}"
1170
+ conv.append_message(role, sentence["value"])
1171
+ conversations.append(conv.get_prompt())
1172
+
1173
+ # Tokenize conversations
1174
+ if has_image:
1175
+ input_ids = torch.stack(
1176
+ [
1177
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1178
+ for prompt in conversations
1179
+ ],
1180
+ dim=0,
1181
+ )
1182
+ else:
1183
+ input_ids = tokenizer(
1184
+ conversations,
1185
+ return_tensors="pt",
1186
+ padding="longest",
1187
+ max_length=tokenizer.model_max_length,
1188
+ truncation=True,
1189
+ ).input_ids
1190
+
1191
+ targets = input_ids.clone()
1192
+ assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
1193
+
1194
+ # Mask targets
1195
+ sep = conv.sep + conv.roles[1]
1196
+ for conversation, target in zip(conversations, targets):
1197
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
1198
+
1199
+ rounds = conversation.split(conv.sep)
1200
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
1201
+ for conv_idx in range(3, len(rounds), 2):
1202
+ re_rounds.append(
1203
+ conv.sep.join(rounds[conv_idx : conv_idx + 2])
1204
+ ) # user + gpt
1205
+ cur_len = 0
1206
+ target[:cur_len] = IGNORE_INDEX
1207
+ for i, rou in enumerate(re_rounds):
1208
+ if rou == "":
1209
+ break
1210
+
1211
+ parts = rou.split(sep)
1212
+ if len(parts) != 2:
1213
+ break
1214
+ parts[0] += sep
1215
+ if has_image:
1216
+ round_len = len(tokenizer_image_token(rou, tokenizer))
1217
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
1218
+ else:
1219
+ round_len = len(tokenizer(rou).input_ids)
1220
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
1221
+
1222
+ if (
1223
+ i != 0
1224
+ and getattr(tokenizer, "legacy", False)
1225
+ and IS_TOKENIZER_GREATER_THAN_0_14
1226
+ ):
1227
+ round_len += 1
1228
+ instruction_len += 1
1229
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1230
+
1231
+ cur_len += round_len
1232
+ target[cur_len:] = IGNORE_INDEX
1233
+
1234
+ if cur_len < tokenizer.model_max_length:
1235
+ if cur_len != total_len:
1236
+ target[:] = IGNORE_INDEX
1237
+ print(
1238
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1239
+ f" (ignored)"
1240
+ )
1241
+
1242
+ return dict(
1243
+ input_ids=input_ids,
1244
+ labels=targets,
1245
+ )
1246
+
1247
+
1248
+ def preprocess_plain(
1249
+ sources: Sequence[str],
1250
+ tokenizer: transformers.PreTrainedTokenizer,
1251
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
1252
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
1253
+ ) -> Dict:
1254
+ # add end signal and concatenate together
1255
+ conversations = []
1256
+ for source in sources:
1257
+ assert len(source) == 2
1258
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]` but
1259
+ # got `str`.
1260
+ assert DEFAULT_IMAGE_TOKEN in source[0]["value"]
1261
+ # pyre-fixme[16]: `str` has no attribute `__setitem__`.
1262
+ source[0]["value"] = DEFAULT_IMAGE_TOKEN
1263
+ conversation = (
1264
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]`
1265
+ # but got `str`.
1266
+ source[0]["value"]
1267
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]`
1268
+ # but got `str`.
1269
+ + source[1]["value"]
1270
+ + conversation_lib.default_conversation.sep
1271
+ )
1272
+ conversations.append(conversation)
1273
+ # tokenize conversations
1274
+ input_ids = [
1275
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1276
+ for prompt in conversations
1277
+ ]
1278
+ targets = copy.deepcopy(input_ids)
1279
+ for target, source in zip(targets, sources):
1280
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]` but
1281
+ # got `str`.
1282
+ tokenized_len = len(tokenizer_image_token(source[0]["value"], tokenizer))
1283
+ target[:tokenized_len] = IGNORE_INDEX
1284
+
1285
+ return dict(input_ids=input_ids, labels=targets)
1286
+
1287
+
1288
+ def preprocess(
1289
+ sources: Sequence[str],
1290
+ tokenizer: transformers.PreTrainedTokenizer,
1291
+ has_image: bool = False,
1292
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
1293
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
1294
+ ) -> Dict:
1295
+ """
1296
+ Given a list of sources, each is a conversation list. This transform:
1297
+ 1. Add signal '### ' at the beginning of each sentence, with end signal '\n';
1298
+ 2. Concatenate conversations together;
1299
+ 3. Tokenize the concatenated conversation;
1300
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
1301
+ """
1302
+ if (
1303
+ conversation_lib.default_conversation.sep_style
1304
+ == conversation_lib.SeparatorStyle.PLAIN
1305
+ ):
1306
+ return preprocess_plain(sources, tokenizer)
1307
+ if (
1308
+ conversation_lib.default_conversation.sep_style
1309
+ == conversation_lib.SeparatorStyle.LLAMA_2
1310
+ ):
1311
+ return preprocess_llama_2(sources, tokenizer, has_image=has_image)
1312
+ if conversation_lib.default_conversation.version.startswith("v1"):
1313
+ return preprocess_v1(sources, tokenizer, has_image=has_image)
1314
+ if conversation_lib.default_conversation.version == "mpt":
1315
+ return preprocess_mpt(sources, tokenizer, has_image=has_image)
1316
+ if conversation_lib.default_conversation.version == "llama3":
1317
+ return preprocess_llama3(sources, tokenizer, has_image=has_image)
1318
+ if conversation_lib.default_conversation.version == "llama3_1":
1319
+ return preprocess_llama_3_1(sources, tokenizer, has_image=has_image)
1320
+ if conversation_lib.default_conversation.version == "llama3_2":
1321
+ return preprocess_llama_3_2(sources, tokenizer, has_image=has_image)
1322
+ if conversation_lib.default_conversation.version == "phi3":
1323
+ return preprocess_phi3(sources, tokenizer, has_image=has_image)
1324
+ if conversation_lib.default_conversation.version == "qwen":
1325
+ return preprocess_qwen(sources, tokenizer, has_image=has_image)
1326
+ # add end signal and concatenate together
1327
+ conversations = []
1328
+ for source in sources:
1329
+ header = f"{conversation_lib.default_conversation.system}\n\n"
1330
+ conversation = _add_speaker_and_signal(header, source)
1331
+ conversations.append(conversation)
1332
+
1333
+ # tokenize conversations
1334
+ # pyre-fixme[3]: Return type must be annotated.
1335
+ # pyre-fixme[2]: Parameter must be annotated.
1336
+ def get_tokenize_len(prompts):
1337
+ return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts]
1338
+
1339
+ if has_image:
1340
+ input_ids = [
1341
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1342
+ for prompt in conversations
1343
+ ]
1344
+ else:
1345
+ conversations_tokenized = _tokenize_fn(conversations, tokenizer)
1346
+ input_ids = conversations_tokenized["input_ids"]
1347
+
1348
+ targets = copy.deepcopy(input_ids)
1349
+ for target, source in zip(targets, sources):
1350
+ if has_image:
1351
+ # pyre-fixme[61]: `header` is undefined, or not always defined.
1352
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]`
1353
+ # but got `str`.
1354
+ tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source])
1355
+ else:
1356
+ tokenized_lens = _tokenize_fn(
1357
+ # pyre-fixme[61]: `header` is undefined, or not always defined.
1358
+ # pyre-fixme[6]: For 1st argument expected `Union[slice,
1359
+ # SupportsIndex]` but got `str`.
1360
+ [header] + [s["value"] for s in source],
1361
+ tokenizer,
1362
+ )["input_ids_lens"]
1363
+ # pyre-fixme[6]: For 1st argument expected `Union[slice, SupportsIndex]` but
1364
+ # got `str`.
1365
+ speakers = [sentence["from"] for sentence in source]
1366
+ _mask_targets(target, tokenized_lens, speakers)
1367
+
1368
+ return dict(input_ids=input_ids, labels=targets)
1369
+
1370
+
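+ # Note on preprocess above: the dispatch is driven by conversation_lib.default_conversation
+ # (its sep_style and version), falling back to the legacy "### "-separated format built by
+ # _add_speaker_and_signal when no template matches.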
1371
+ class LazySupervisedDataset(Dataset):
1372
+ """Dataset for supervised fine-tuning."""
1373
+
1374
+ def __init__(
1375
+ self,
1376
+ data_path: str,
1377
+ tokenizer: transformers.PreTrainedTokenizer,
1378
+ # pyre-fixme[2]: Parameter must be annotated.
1379
+ data_args,
1380
+ ) -> None:
1381
+ super(LazySupervisedDataset, self).__init__()
1382
+ list_data_dict = json.load(open(data_path, "r"))
1383
+
1384
+ self.tokenizer = tokenizer
1385
+ # pyre-fixme[4]: Attribute must be annotated.
1386
+ self.list_data_dict = list_data_dict
1387
+ # pyre-fixme[4]: Attribute must be annotated.
1388
+ self.data_args = data_args
1389
+
1390
+ @property
1391
+ # pyre-fixme[3]: Return type must be annotated.
1392
+ def lengths(self):
1393
+ length_list = []
1394
+ for sample in self.list_data_dict:
1395
+ img_tokens = 128 if "image" in sample else 0
1396
+ length_list.append(
1397
+ sum(len(conv["value"].split()) for conv in sample["conversations"])
1398
+ + img_tokens
1399
+ )
1400
+ return length_list
1401
+
1402
+ @property
1403
+ def modality_lengths(self) -> List[int]:
1404
+ length_list = []
1405
+ for sample in self.list_data_dict:
1406
+ cur_len = sum(
1407
+ len(conv["value"].split()) for conv in sample["conversations"]
1408
+ )
1409
+ cur_len = (
1410
+ cur_len if ("image" in sample) or ("video" in sample) else -cur_len
1411
+ )
1412
+ length_list.append(cur_len)
1413
+ return length_list
1414
+
1415
+ def __len__(self) -> int:
1416
+ return len(self.list_data_dict)
1417
+
1418
+ def __getitem__(self, i: int) -> Dict[str, torch.Tensor]:
1419
+ sources = self.list_data_dict[i]
1420
+ if isinstance(i, int):
1421
+ sources = [sources]
1422
+ assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
1423
+ has_image = True
1424
+ if "image" in sources[0]:
1425
+ image_file = self.list_data_dict[i]["image"]
1426
+ image_folder = self.data_args.image_folder
1427
+ processor = self.data_args.image_processor
1428
+ full_path = os.path.join(image_folder, image_file)
1429
+ if not os.path.exists(full_path):
1430
+ print(full_path)
1431
+ has_image = False
1432
+ sources = copy.deepcopy([e["conversations"] for e in sources])
1433
+ else:
1434
+ image = Image.open(full_path).convert("RGB")
1435
+ if self.data_args.image_aspect_ratio == "sam":
1436
+ image = np.array(image)[:, :, ::-1]
1437
+ if self.data_args.image_aspect_ratio == "pad":
1438
+ # pyre-fixme[3]: Return type must be annotated.
1439
+ # pyre-fixme[2]: Parameter must be annotated.
1440
+ def expand2square(pil_img, background_color):
1441
+ width, height = pil_img.size
1442
+ if width == height:
1443
+ return pil_img
1444
+ elif width > height:
1445
+ result = Image.new(
1446
+ pil_img.mode, (width, width), background_color
1447
+ )
1448
+ result.paste(pil_img, (0, (width - height) // 2))
1449
+ return result
1450
+ else:
1451
+ result = Image.new(
1452
+ pil_img.mode, (height, height), background_color
1453
+ )
1454
+ result.paste(pil_img, ((height - width) // 2, 0))
1455
+ return result
1456
+
1457
+ image = expand2square(
1458
+ image, tuple(int(x * 255) for x in processor.image_mean)
1459
+ )
1460
+ image = processor.preprocess(image, return_tensors="pt")[
1461
+ "pixel_values"
1462
+ ][0]
1463
+ else:
1464
+ if self.data_args.image_aspect_ratio != "sam":
1465
+ image = processor.preprocess(image, return_tensors="pt")[
1466
+ "pixel_values"
1467
+ ][0]
1468
+ sources = preprocess_multimodal(
1469
+ copy.deepcopy([e["conversations"] for e in sources]), self.data_args
1470
+ )
1471
+ elif "video" in sources[0]:
1472
+ video_file = self.list_data_dict[i]["video"]
1473
+ video_folder = self.data_args.image_folder
1474
+ if "webvid" in video_folder:
1475
+ video_file = os.path.join(video_folder, "videos", video_file)
1476
+ elif "ActivityNet" in video_folder:
1477
+ video_file = os.path.join(video_folder, "train_val", video_file)
1478
+ else:
1479
+ video_file = os.path.join(video_folder, video_file)
1480
+ if not os.path.exists(video_file):
1481
+ print("nonexist: {}".format(video_file), flush=True)
1482
+ for sub_folder in os.listdir(video_folder):
1483
+ if os.path.isdir(os.path.join(video_folder, sub_folder)):
1484
+ for sub_sub_folder in os.listdir(
1485
+ os.path.join(video_folder, sub_folder)
1486
+ ):
1487
+ print("folder", sub_folder, sub_sub_folder)
1488
+ has_image = False
1489
+ sources = copy.deepcopy([e["conversations"] for e in sources])
1490
+ else:
1491
+ if video_file.endswith(".webm"):
1492
+ has_image = False
1493
+ sources = copy.deepcopy([e["conversations"] for e in sources])
1494
+ else:
1495
+ try:
1496
+ # if video_file.endswith(".webm"):
1497
+ # video_webm = VideoFileClip(video_file)
1498
+ # video_frames = np.array(list(video_webm.iter_frames()))
1499
+ # sample_fps = round(video_webm.fps / self.data_args.video_fps)
1500
+ # frame_idx = [i for i in range(0, len(video_frames), sample_fps)]
1501
+ # video = video_frames[frame_idx]
1502
+ # else:
1503
+ vr = VideoReader(video_file, ctx=cpu(0), num_threads=1)
1504
+ sample_fps = round(vr.get_avg_fps() / self.data_args.video_fps)
1505
+ frame_idx = [i for i in range(0, len(vr), sample_fps)]
1506
+ video = vr.get_batch(frame_idx).asnumpy()
1507
+ if self.data_args.image_aspect_ratio == "sam":
1508
+ image = video[:, :, :, ::-1][:100]
1509
+ else:
1510
+ processor = self.data_args.image_processor
1511
+ image = processor.preprocess(video, return_tensors="pt")[
1512
+ "pixel_values"
1513
+ ]
1514
+ sources = preprocess_multimodal(
1515
+ copy.deepcopy([e["conversations"] for e in sources]),
1516
+ self.data_args,
1517
+ )
1518
+ except Exception:
1519
+ has_image = False
1520
+ sources = copy.deepcopy([e["conversations"] for e in sources])
1521
+ else:
1522
+ has_image = False
1523
+ sources = copy.deepcopy([e["conversations"] for e in sources])
1524
+ data_dict = preprocess(
1525
+ # pyre-fixme[6]: For 1st argument expected `Sequence[str]` but got
1526
+ # `Union[Dict[typing.Any, typing.Any], List[typing.Any]]`.
1527
+ sources,
1528
+ self.tokenizer,
1529
+ has_image=has_image,
1530
+ )
1531
+ if isinstance(i, int):
1532
+ data_dict = dict(
1533
+ input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0]
1534
+ )
1535
+
1536
+ # image exist in the data
1537
+ if has_image:
1538
+ if "image" in self.list_data_dict[i]:
1539
+ # pyre-fixme[61]: Local variable `image` is undefined, or not always defined.
1540
+ data_dict["image"] = image
1541
+ elif "video" in self.list_data_dict[i]:
1542
+ # pyre-fixme[61]: Local variable `image` is undefined, or not always defined.
1543
+ data_dict["image"] = image
1544
+ elif self.data_args.is_multimodal:
1545
+ # image does not exist in the data, but the model is multimodal
1546
+ # crop_size = self.data_args.image_processor.crop_size
1547
+ # data_dict["image"] = torch.zeros(3, crop_size["height"], crop_size["width"])
1548
+ if self.data_args.image_aspect_ratio == "sam":
1549
+ if "video" in self.list_data_dict[i]:
1550
+ data_dict["image"] = np.zeros((1, 1024, 1024, 3)).astype(np.uint8)
1551
+ else:
1552
+ data_dict["image"] = np.zeros((1024, 1024, 3)).astype(np.uint8)
1553
+ else:
1554
+ crop_size = self.data_args.image_processor.crop_size
1555
+ if "video" in self.list_data_dict[i]:
1556
+ data_dict["image"] = torch.zeros(
1557
+ 1, 3, crop_size["height"], crop_size["width"]
1558
+ )
1559
+ else:
1560
+ data_dict["image"] = torch.zeros(
1561
+ 3, crop_size["height"], crop_size["width"]
1562
+ )
1563
+
1564
+ if has_image:
1565
+ if self.data_args.num_points > 0:
1566
+ if "box" in self.list_data_dict[i]:
1567
+ x1, y1, x2, y2 = self.list_data_dict[i]["box"]
1568
+ points = []
1569
+ x = random.uniform(x1, x2)
1570
+ y = random.uniform(y1, y2)
1571
+ points.append(torch.tensor([x, y, 1]))
1572
+ for _ in range(1, self.data_args.num_points):
1573
+ points.append(torch.tensor([0, 0, 0]))
1574
+ points = torch.stack(points, dim=0)
1575
+ data_dict["point"] = points
1576
+ else:
1577
+ if "point" in self.list_data_dict[i]:
1578
+ points = torch.tensor(self.list_data_dict[i]["point"])
1579
+ data_dict["point"] = points
1580
+ else:
1581
+ points = []
1582
+ grid = int(np.sqrt(self.data_args.num_points))
1583
+ height, width = image.shape[0], image.shape[1]
1584
+ for i in range(grid):
1585
+ for j in range(grid):
1586
+ points.append(
1587
+ torch.tensor(
1588
+ [
1589
+ width / grid / 2.0 + i / grid * width,
1590
+ height / grid / 2.0 + j / grid * height,
1591
+ 1,
1592
+ ]
1593
+ )
1594
+ )
1595
+ points = torch.stack(points, dim=0)
1596
+ data_dict["point"] = points
1597
+ elif self.data_args.is_multimodal:
1598
+ if self.data_args.num_points > 0:
1599
+ points = []
1600
+ grid = int(np.sqrt(self.data_args.num_points))
1601
+ height, width = data_dict["image"].shape[0], data_dict["image"].shape[1]
1602
+ for i in range(grid):
1603
+ for j in range(grid):
1604
+ points.append(
1605
+ torch.tensor(
1606
+ [
1607
+ width / grid / 2.0 + i / grid * width,
1608
+ height / grid / 2.0 + j / grid * height,
1609
+ 1,
1610
+ ]
1611
+ )
1612
+ )
1613
+ points = torch.stack(points, dim=0)
1614
+ data_dict["point"] = points
1615
+
1616
+ return data_dict
1617
+
1618
+
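+ # Note on __getitem__ above: videos are decoded with decord and subsampled to roughly
+ # data_args.video_fps frames per second (every round(avg_fps / video_fps)-th frame); unreadable
+ # or .webm files fall back to a text-only sample, and missing modalities get zero-filled image
+ # tensors (or uint8 arrays in the "sam" branch) plus an optional grid of point prompts.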
1619
+ @dataclass
1620
+ class DataCollatorForSupervisedDataset(object):
1621
+ """Collate examples for supervised fine-tuning."""
1622
+
1623
+ tokenizer: transformers.PreTrainedTokenizer
1624
+
1625
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
1626
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
1627
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
1628
+ input_ids, labels = tuple(
1629
+ [instance[key] for instance in instances] for key in ("input_ids", "labels")
1630
+ )
1631
+ input_ids = torch.nn.utils.rnn.pad_sequence(
1632
+ input_ids,
1633
+ batch_first=True,
1634
+ # pyre-fixme[6]: For 3rd argument expected `float` but got `Optional[int]`.
1635
+ padding_value=self.tokenizer.pad_token_id,
1636
+ )
1637
+ labels = torch.nn.utils.rnn.pad_sequence(
1638
+ labels, batch_first=True, padding_value=IGNORE_INDEX
1639
+ )
1640
+ input_ids = input_ids[:, : self.tokenizer.model_max_length]
1641
+ labels = labels[:, : self.tokenizer.model_max_length]
1642
+ batch = dict(
1643
+ input_ids=input_ids,
1644
+ labels=labels,
1645
+ # pyre-fixme[6]: For 1st argument expected `Tensor` but got `Optional[int]`.
1646
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
1647
+ )
1648
+
1649
+ # if "image" in instances[0]:
1650
+ # images = [instance["image"] for instance in instances]
1651
+ # if all(x is not None and x.shape == images[0].shape for x in images):
1652
+ # if type(images[0]) is torch.Tensor:
1653
+ # batch["images"] = torch.stack(images)
1654
+ # else:
1655
+ #
1656
+ # batch["images"] = np.stack(images)
1657
+ # else:
1658
+ #
1659
+ # # `List[typing.Any]`.
1660
+ # batch["images"] = images
1661
+
1662
+ if "image" in instances[0]:
1663
+ images = [instance["image"] for instance in instances]
1664
+ # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `List[typing.Any]`.
1665
+ batch["images"] = images
1666
+
1667
+ if "point" in instances[0]:
1668
+ points = [instance["point"] for instance in instances]
1669
+ batch["points"] = torch.stack(points)
1670
+
1671
+ return batch
1672
+
1673
+
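+ # Note on the collator above: input_ids are right-padded with pad_token_id and labels with
+ # IGNORE_INDEX, both are truncated to model_max_length, and images are passed through as a
+ # plain list because per-sample shapes may differ (e.g. SAM-style numpy frames vs. CLIP-style
+ # tensors).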
1674
+ def make_supervised_data_module(
1675
+ tokenizer: transformers.PreTrainedTokenizer,
1676
+ # pyre-fixme[2]: Parameter must be annotated.
1677
+ data_args,
1678
+ # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use
1679
+ # `typing.Dict[<key type>, <value type>]` to avoid runtime subscripting errors.
1680
+ ) -> Dict:
1681
+ """Make dataset and collator for supervised fine-tuning."""
1682
+ train_dataset = LazySupervisedDataset(
1683
+ tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args
1684
+ )
1685
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
1686
+ return dict(
1687
+ train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator
1688
+ )
longvu/mm_utils.py ADDED
@@ -0,0 +1,327 @@
1
+ import ast
2
+ import base64
3
+ import math
4
+ from io import BytesIO
5
+
6
+ import torch
7
+ from longvu.constants import IMAGE_TOKEN_INDEX
8
+ from PIL import Image
9
+
10
+ from transformers import StoppingCriteria
11
+
12
+
13
+ def select_best_resolution(original_size, possible_resolutions):
14
+ """
15
+ Selects the best resolution from a list of possible resolutions based on the original size.
16
+
17
+ Args:
18
+ original_size (tuple): The original size of the image in the format (width, height).
19
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
20
+
21
+ Returns:
22
+ tuple: The best fit resolution in the format (width, height).
23
+ """
24
+ original_width, original_height = original_size
25
+ best_fit = None
26
+ max_effective_resolution = 0
27
+ min_wasted_resolution = float("inf")
28
+
29
+ for width, height in possible_resolutions:
30
+ scale = min(width / original_width, height / original_height)
31
+ downscaled_width, downscaled_height = int(original_width * scale), int(
32
+ original_height * scale
33
+ )
34
+ effective_resolution = min(
35
+ downscaled_width * downscaled_height, original_width * original_height
36
+ )
37
+ wasted_resolution = (width * height) - effective_resolution
38
+
39
+ if effective_resolution > max_effective_resolution or (
40
+ effective_resolution == max_effective_resolution
41
+ and wasted_resolution < min_wasted_resolution
42
+ ):
43
+ max_effective_resolution = effective_resolution
44
+ min_wasted_resolution = wasted_resolution
45
+ best_fit = (width, height)
46
+
47
+ return best_fit
48
+
49
+
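+ # Worked example (hypothetical candidate list): select_best_resolution((1000, 500),
+ # [(672, 672), (1008, 336)]) returns (1008, 336); both candidates give the same effective
+ # resolution of 672x336 = 225792 px, but (1008, 336) wastes 112896 px versus 225792 px.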
50
+ def resize_and_pad_image(image, target_resolution):
51
+ """
52
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
53
+
54
+ Args:
55
+ image (PIL.Image.Image): The input image.
56
+ target_resolution (tuple): The target resolution (width, height) of the image.
57
+
58
+ Returns:
59
+ PIL.Image.Image: The resized and padded image.
60
+ """
61
+ original_width, original_height = image.size
62
+ target_width, target_height = target_resolution
63
+
64
+ scale_w = target_width / original_width
65
+ scale_h = target_height / original_height
66
+
67
+ if scale_w < scale_h:
68
+ new_width = target_width
69
+ new_height = min(math.ceil(original_height * scale_w), target_height)
70
+ else:
71
+ new_height = target_height
72
+ new_width = min(math.ceil(original_width * scale_h), target_width)
73
+
74
+ # Resize the image
75
+ resized_image = image.resize((new_width, new_height))
76
+
77
+ new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
78
+ paste_x = (target_width - new_width) // 2
79
+ paste_y = (target_height - new_height) // 2
80
+ new_image.paste(resized_image, (paste_x, paste_y))
81
+
82
+ return new_image
83
+
84
+
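+ # Continuing the example above: resize_and_pad_image(img_1000x500, (1008, 336)) scales the image
+ # by 0.672 to 672x336 and pastes it onto a black 1008x336 canvas with 168 px of padding on the
+ # left and right.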
85
+ def divide_to_patches(image, patch_size):
86
+ """
87
+ Divides an image into patches of a specified size.
88
+
89
+ Args:
90
+ image (PIL.Image.Image): The input image.
91
+ patch_size (int): The size of each patch.
92
+
93
+ Returns:
94
+ list: A list of PIL.Image.Image objects representing the patches.
95
+ """
96
+ patches = []
97
+ width, height = image.size
98
+ for i in range(0, height, patch_size):
99
+ for j in range(0, width, patch_size):
100
+ box = (j, i, j + patch_size, i + patch_size)
101
+ patch = image.crop(box)
102
+ patches.append(patch)
103
+
104
+ return patches
105
+
106
+
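+ # Continuing the example: divide_to_patches on a 1008x336 image with patch_size=336 walks the
+ # image row by row and yields 3 patches of 336x336, left to right.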
107
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
108
+ """
109
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
110
+
111
+ Args:
112
+ image_size (tuple): The size of the input image in the format (width, height).
113
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
114
+ patch_size (int): The size of each image patch.
115
+
116
+ Returns:
117
+ tuple: The shape of the image patch grid in the format (width, height).
118
+ """
119
+ if type(grid_pinpoints) is list:
120
+ possible_resolutions = grid_pinpoints
121
+ else:
122
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
123
+ width, height = select_best_resolution(image_size, possible_resolutions)
124
+ return width // patch_size, height // patch_size
125
+
126
+
127
+ def process_anyres_image(image, processor, grid_pinpoints):
128
+ """
129
+ Process an image with variable resolutions.
130
+
131
+ Args:
132
+ image (PIL.Image.Image): The input image to be processed.
133
+ processor: The image processor object.
134
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
135
+
136
+ Returns:
137
+ torch.Tensor: A tensor containing the processed image patches.
138
+ """
139
+ if type(grid_pinpoints) is list:
140
+ possible_resolutions = grid_pinpoints
141
+ else:
142
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
143
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
144
+ image_padded = resize_and_pad_image(image, best_resolution)
145
+
146
+ patches = divide_to_patches(image_padded, processor.crop_size["height"])
147
+
148
+ image_original_resize = image.resize(
149
+ (processor.size["shortest_edge"], processor.size["shortest_edge"])
150
+ )
151
+
152
+ image_patches = [image_original_resize] + patches
153
+ image_patches = [
154
+ processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0]
155
+ for image_patch in image_patches
156
+ ]
157
+ return torch.stack(image_patches, dim=0)
158
+
159
+
160
+ def load_image_from_base64(image):
161
+ return Image.open(BytesIO(base64.b64decode(image)))
162
+
163
+
164
+ def expand2square(pil_img, background_color):
165
+ width, height = pil_img.size
166
+ if width == height:
167
+ return pil_img
168
+ elif width > height:
169
+ result = Image.new(pil_img.mode, (width, width), background_color)
170
+ result.paste(pil_img, (0, (width - height) // 2))
171
+ return result
172
+ else:
173
+ result = Image.new(pil_img.mode, (height, height), background_color)
174
+ result.paste(pil_img, ((height - width) // 2, 0))
175
+ return result
176
+
177
+
178
+ # def process_images(images, image_processor, model_cfg):
179
+ # image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
180
+ # new_images = []
181
+ # if image_aspect_ratio == 'pad':
182
+ # for image in images:
183
+ # image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
184
+ # image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
185
+ # new_images.append(image)
186
+ # elif image_aspect_ratio == "anyres":
187
+ # for image in images:
188
+ # image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
189
+ # new_images.append(image)
190
+ # else:
191
+ # return image_processor(images, return_tensors='pt')['pixel_values']
192
+ # if all(x.shape == new_images[0].shape for x in new_images):
193
+ # new_images = torch.stack(new_images, dim=0)
194
+ # return new_images
195
+
196
+
197
+ # multiple vision towers
198
+ def process_images(images, image_processor, model_cfg):
199
+ processor_aux_list = image_processor
200
+ new_images_aux_list = []
201
+ for image in images:
202
+ image_aux_list = []
203
+ for processor_aux in processor_aux_list:
204
+ image_aux = image
205
+ if hasattr(processor_aux, "image_mean"):
206
+ try:
207
+ target_resolution = processor_aux.crop_size["height"]
208
+ except:
209
+ target_resolution = processor_aux.size["height"]
210
+ image_aux = expand2square(
211
+ image_aux, tuple(int(x * 255) for x in processor_aux.image_mean)
212
+ ).resize((target_resolution, target_resolution))
213
+ image_aux = processor_aux.preprocess(image_aux, return_tensors="pt")[
214
+ "pixel_values"
215
+ ][0]
216
+ image_aux_list.append(image_aux)
217
+ new_images_aux_list.append(image_aux_list)
218
+ new_images_aux_list = [
219
+ list(batch_image_aux) for batch_image_aux in zip(*new_images_aux_list)
220
+ ]
221
+ new_images_aux_list = [
222
+ torch.stack(image_aux).half().cuda() for image_aux in new_images_aux_list
223
+ ]
224
+ return new_images_aux_list
225
+
226
+
227
+ def tokenizer_image_token(
228
+ prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
229
+ ):
230
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
231
+
232
+ def insert_separator(X, sep):
233
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
234
+
235
+ input_ids = []
236
+ offset = 0
237
+ if (
238
+ len(prompt_chunks) > 0
239
+ and len(prompt_chunks[0]) > 0
240
+ and prompt_chunks[0][0] == tokenizer.bos_token_id
241
+ ):
242
+ offset = 1
243
+ input_ids.append(prompt_chunks[0][0])
244
+
245
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
246
+ input_ids.extend(x[offset:])
247
+
248
+ if return_tensors is not None:
249
+ if return_tensors == "pt":
250
+ return torch.tensor(input_ids, dtype=torch.long)
251
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
252
+ return input_ids
253
+
254
+
255
+ def tokenizer_image_token_llama3(
256
+ prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
257
+ ):
258
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
259
+
260
+ def insert_separator(X, sep):
261
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
262
+
263
+ input_ids = []
264
+ for x in insert_separator(prompt_chunks, [image_token_index]):
265
+ input_ids.extend(x)
266
+
267
+ if return_tensors is not None:
268
+ if return_tensors == "pt":
269
+ return torch.tensor(input_ids, dtype=torch.long)
270
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
271
+ return input_ids
272
+
273
+
274
+ def get_model_name_from_path(model_path):
275
+ model_path = model_path.strip("/")
276
+ model_paths = model_path.split("/")
277
+ if model_paths[-1].startswith("checkpoint-"):
278
+ return model_paths[-2] + "_" + model_paths[-1]
279
+ else:
280
+ return model_paths[-1]
281
+
282
+
283
+ class KeywordsStoppingCriteria(StoppingCriteria):
284
+ def __init__(self, keywords, tokenizer, input_ids):
285
+ self.keywords = keywords
286
+ self.keyword_ids = []
287
+ self.max_keyword_len = 0
288
+ for keyword in keywords:
289
+ cur_keyword_ids = tokenizer(keyword).input_ids
290
+ if (
291
+ len(cur_keyword_ids) > 1
292
+ and cur_keyword_ids[0] == tokenizer.bos_token_id
293
+ ):
294
+ cur_keyword_ids = cur_keyword_ids[1:]
295
+ if len(cur_keyword_ids) > self.max_keyword_len:
296
+ self.max_keyword_len = len(cur_keyword_ids)
297
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
298
+ self.tokenizer = tokenizer
299
+ self.start_len = input_ids.shape[1]
300
+
301
+ def call_for_batch(
302
+ self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
303
+ ) -> bool:
304
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
305
+ self.keyword_ids = [
306
+ keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids
307
+ ]
308
+ for keyword_id in self.keyword_ids:
309
+ truncated_output_ids = output_ids[0, -keyword_id.shape[0] :]
310
+ if torch.equal(truncated_output_ids, keyword_id):
311
+ return True
312
+ outputs = self.tokenizer.batch_decode(
313
+ output_ids[:, -offset:], skip_special_tokens=True
314
+ )[0]
315
+ for keyword in self.keywords:
316
+ if keyword in outputs:
317
+ return True
318
+ return False
319
+
320
+ def __call__(
321
+ self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
322
+ ) -> bool:
323
+ outputs = []
324
+ for i in range(output_ids.shape[0]):
325
+ # pyre-fixme[6]: For 1st argument expected `LongTensor` but got `Tensor`.
326
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
327
+ return all(outputs)
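The helpers above cover the full "anyres" preprocessing path: pick the best candidate resolution, letterbox the frame to it, then split it into square patches. A minimal, self-contained sketch of how they compose (the candidate grid and the 384px patch size below are made up for illustration; in the real pipeline they come from the model config and the image processor):

    from PIL import Image
    from longvu.mm_utils import (
        select_best_resolution,
        resize_and_pad_image,
        divide_to_patches,
    )

    img = Image.new("RGB", (1000, 600), (30, 90, 200))   # dummy 1000x600 frame
    grid = [(768, 384), (384, 768), (768, 768)]          # hypothetical pinpoints
    best = select_best_resolution(img.size, grid)        # (768, 768) for this size
    padded = resize_and_pad_image(img, best)             # letterboxed to 768x768
    patches = divide_to_patches(padded, 384)             # 4 square 384x384 patches
    print(best, len(patches))                            # (768, 768) 4
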
longvu/multimodal_encoder/__pycache__/base_encoder.cpython-310.pyc ADDED
Binary file (4.33 kB). View file
 
longvu/multimodal_encoder/__pycache__/builder.cpython-310.pyc ADDED
Binary file (1 kB). View file
 
longvu/multimodal_encoder/__pycache__/dino_encoder.cpython-310.pyc ADDED
Binary file (3.67 kB). View file
 
longvu/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc ADDED
Binary file (2.6 kB). View file
 
longvu/multimodal_encoder/base_encoder.py ADDED
@@ -0,0 +1,135 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class ProcessorWrapper:
8
+ def __init__(
9
+ self,
10
+ transform,
11
+ height=378,
12
+ width=378,
13
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
14
+ ):
15
+ self._crop_size = {
16
+ "height": height,
17
+ "width": width,
18
+ }
19
+ self._transforms = transform
20
+ # print(transform)
21
+ self.image_mean = image_mean
22
+
23
+ @property
24
+ def crop_size(self):
25
+ return self._crop_size
26
+
27
+ def preprocess(self, image, return_tensors="pt"):
28
+ # Ensure image is a PIL Image
29
+ output = {}
30
+ output["pixel_values"] = [self._transforms(image)]
31
+ return output
32
+
33
+
34
+ class BaseVisionTower(nn.Module):
35
+ def __init__(self, vision_tower_name, args, delay_load=False):
36
+ super().__init__()
37
+
38
+ self.is_loaded = False
39
+ self.args = args
40
+
41
+ self.vision_tower_name = vision_tower_name
42
+ self.select_layer = args.mm_vision_select_layer
43
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
44
+ self.unfreeze_mm_vision_tower = getattr(args, "unfreeze_mm_vision_tower", False)
45
+ self.delay_load = delay_load
46
+
47
+ @abstractmethod
48
+ def load_model(self, device_map=None):
49
+ raise NotImplementedError("Subclasses must implement load_model")
50
+
51
+ @abstractmethod
52
+ def _forward(self, images):
53
+ raise NotImplementedError("Subclasses must implement forward")
54
+
55
+ def forward(self, images):
56
+ if type(images) is list:
57
+ image_features = [self._forward(image.unsqueeze(0)) for image in images]
58
+ else:
59
+ image_features = self._forward(images)
60
+
61
+ return image_features
62
+
63
+ @property
64
+ def dummy_feature(self):
65
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
66
+
67
+ @property
68
+ def dtype(self):
69
+ # Dynamically infer the dtype from the first parameter, if not explicitly specified
70
+ if hasattr(self.vision_tower, "dtype"):
71
+ return self.vision_tower.dtype
72
+ else:
73
+ params = list(self.vision_tower.parameters())
74
+ return (
75
+ params[0].dtype if len(params) > 0 else torch.float32
76
+ ) # Default to torch.float32 if no parameters
77
+
78
+ @property
79
+ def device(self):
80
+ # Dynamically infer the device from the first parameter, if not explicitly specified
81
+ if hasattr(self.vision_tower, "device"):
82
+ return self.vision_tower.device
83
+ else:
84
+ params = list(self.vision_tower.parameters())
85
+ return (
86
+ params[0].device if len(params) > 0 else torch.device("cpu")
87
+ ) # Default to CPU if no parameters
88
+
89
+ @property
90
+ def config(self):
91
+ if self.is_loaded:
92
+ return self.vision_tower.config
93
+ else:
94
+ return self.cfg_only
95
+
96
+ @property
97
+ def hidden_size(self):
98
+ try:
99
+ return self.config.hidden_size
100
+ except:
101
+ return self._hidden_size
102
+
103
+ @property
104
+ def image_size(self): # resolution
105
+ # return self.config.image_size
106
+ try:
107
+ return self.config.image_size
108
+ except:
109
+ return self._image_size
110
+
111
+ @property
112
+ def patch_size(self):
113
+ # return self.config.patch_size
114
+ try:
115
+ return self.config.patch_size
116
+ except:
117
+ return self._patch_size
118
+
119
+ @property
120
+ def num_patches_per_side(self):
121
+ if self._interp_size is not None:
122
+ return int(self._interp_size**0.5)
123
+ try:
124
+ return self.image_size // self.patch_size
125
+ except:
126
+ return self._num_patches_per_side
127
+
128
+ @property
129
+ def num_patches(self):
130
+ if self._interp_size is not None:
131
+ return self._interp_size
132
+ try:
133
+ return self.num_patches_per_side**2
134
+ except:
135
+ return self._num_patches
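ProcessorWrapper is what lets a plain torchvision transform stand in for a Hugging Face image processor: it exposes crop_size plus a preprocess() that returns pixel_values. A small usage sketch; the normalization values are illustrative only, not taken from this repo:

    from PIL import Image
    from torchvision import transforms
    from longvu.multimodal_encoder.base_encoder import ProcessorWrapper

    tfm = transforms.Compose([
        transforms.Resize((378, 378)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                             std=[0.26862954, 0.26130258, 0.27577711]),
    ])
    proc = ProcessorWrapper(tfm, height=378, width=378)
    out = proc.preprocess(Image.new("RGB", (512, 512)))
    print(proc.crop_size, out["pixel_values"][0].shape)
    # {'height': 378, 'width': 378} torch.Size([3, 378, 378])
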
longvu/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,37 @@
1
+ # pyre-unsafe
2
+ import copy
3
+
4
+ from .dino_encoder import DinoVisionTower
5
+ from .siglip_encoder import SiglipVisionTower
6
+
7
+
8
+ def build_vision_tower_aux_list(vision_tower_cfg, **kwargs):
9
+ vision_tower_aux_name_list = getattr(
10
+ vision_tower_cfg,
11
+ "mm_vision_tower_aux_list",
12
+ getattr(vision_tower_cfg, "vision_tower_aux_list", None),
13
+ )
14
+ vision_tower_aux_token_len_list = getattr(
15
+ vision_tower_cfg,
16
+ "mm_vision_tower_aux_token_len_list",
17
+ getattr(vision_tower_cfg, "vision_tower_aux_token_len_list", None),
18
+ )
19
+ vision_tower_aux_list = []
20
+ for vision_tower_aux_name, vision_tower_aux_token_len in zip(
21
+ vision_tower_aux_name_list, vision_tower_aux_token_len_list
22
+ ):
23
+ config = copy.deepcopy(vision_tower_cfg)
24
+ vision_tower_aux_name += "-interp{}".format(vision_tower_aux_token_len)
25
+ if "siglip" in vision_tower_aux_name.lower():
26
+ vision_tower_aux_list.append(
27
+ SiglipVisionTower(vision_tower_aux_name, args=config, **kwargs)
28
+ )
29
+
30
+ # SSL-based Vision Towers
31
+ elif "dinov2" in vision_tower_aux_name.lower():
32
+ vision_tower_aux_list.append(
33
+ DinoVisionTower(vision_tower_aux_name, args=config, **kwargs)
34
+ )
35
+ else:
36
+ raise ValueError(f"Unknown vision tower: {vision_tower_aux_name}")
37
+ return vision_tower_aux_list
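build_vision_tower_aux_list only reads a handful of attributes off the config object, so a bare namespace is enough to exercise it. The tower names and token lengths below are hypothetical, and the SigLIP/DINOv2 weights are expected under the ./checkpoints paths hard-coded in the encoder classes:

    from types import SimpleNamespace
    from longvu.multimodal_encoder.builder import build_vision_tower_aux_list

    cfg = SimpleNamespace(
        mm_vision_tower_aux_list=["siglip-so400m-patch14-384", "dinov2-giant"],
        mm_vision_tower_aux_token_len_list=[576, 576],
        mm_vision_select_layer=-2,
        mm_vision_select_feature="patch",
        unfreeze_mm_vision_tower=False,
    )
    towers = build_vision_tower_aux_list(cfg, delay_load=True)
    print([type(t).__name__ for t in towers])
    # ['SiglipVisionTower', 'DinoVisionTower']
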
longvu/multimodal_encoder/dino_encoder.py ADDED
@@ -0,0 +1,131 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+ from transformers import AutoImageProcessor, Dinov2Config, Dinov2Model
5
+
6
+ from .base_encoder import BaseVisionTower, ProcessorWrapper
7
+
8
+
9
+ class DinoVisionTower(BaseVisionTower):
10
+ def __init__(self, vision_tower, args, delay_load=False):
11
+ super(DinoVisionTower, self).__init__(vision_tower, args, delay_load)
12
+
13
+ model_path = "./checkpoints/dinov2-giant"
14
+ base_model_name, res, interp = model_path, 378, 576
15
+ self._vision_tower_name = vision_tower
16
+ self.vision_tower_name = base_model_name
17
+ self._image_size = res
18
+ self._interp_size = interp
19
+ self._patch_size = 14 # default patch size
20
+
21
+ if not self.delay_load:
22
+ self.load_model()
23
+ else:
24
+ self.cfg_only = Dinov2Config.from_pretrained(self.vision_tower_name)
25
+
26
+ def load_model(self, device_map=None):
27
+
28
+ self.vision_tower = Dinov2Model.from_pretrained(self.vision_tower_name)
29
+ """ValueError: Dinov2Model does not support `device_map='auto'`. To implement support, the model class needs to implement the `_no_split_modules` attribute."""
30
+ self.vision_tower._no_split_modules = ["Dinov2SwiGLUFFN"]
31
+
32
+ _image_size = self.vision_tower.config.image_size
33
+ if self._image_size is None:
34
+ self._image_size = _image_size
35
+
36
+ # increase shortest edge to prevent edge case crops
37
+ default_shortest_ratio = 8 / 7  # 256/224
38
+ # shortest_edge = int(default_shortest_ratio * self._image_size)
39
+ shortest_edge = self._image_size
40
+
41
+ processor = AutoImageProcessor.from_pretrained(
42
+ self.vision_tower_name,
43
+ crop_size=dict(height=self._image_size, width=self._image_size),
44
+ size=dict(shortest_edge=shortest_edge),
45
+ )
46
+ self.image_processor = processor
47
+
48
+ # Assign the output channels of the projection convolution as the hidden size
49
+ self._hidden_size = (
50
+ self.vision_tower.embeddings.patch_embeddings.projection.out_channels
51
+ )
52
+ # Assign the first value of the stride of the projection convolution as the patch size
53
+ self._patch_size = (
54
+ self.vision_tower.embeddings.patch_embeddings.projection.stride[0]
55
+ )
56
+
57
+ # print(self._hidden_size, self._patch_size)
58
+
59
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
60
+ self.is_loaded = True
61
+
62
+ @property
63
+ def image_size(self):
64
+ return self._image_size
65
+
66
+ def feature_select(self, outputs):
67
+ sequence_output = outputs[
68
+ "last_hidden_state"
69
+ ] # batch_size, sequence_length, hidden_size
70
+
71
+ if self.select_feature == "cls_patch":
72
+ image_features = sequence_output
73
+ elif self.select_feature == "patch":
74
+ image_features = sequence_output[:, 1:]
75
+ elif self.select_feature == "cls":
76
+ image_features = sequence_output[:, 0]
77
+ else:
78
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
79
+ return image_features
80
+
81
+ def interpolate(self, image_features):
82
+ if self._interp_size is None:
83
+ return image_features
84
+
85
+ b, num_tokens, dim = image_features.shape
86
+
87
+ if num_tokens != self.num_patches:
88
+ target_h = target_w = int(self._interp_size**0.5)
89
+ h = w = int(num_tokens**0.5)
90
+
91
+ image_features = image_features.view(b, h, w, dim)
92
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
93
+
94
+ image_features = F.interpolate(
95
+ image_features.to(torch.float32),
96
+ size=(target_h, target_w),
97
+ mode="bilinear",
98
+ align_corners=False,
99
+ ).to(image_features.dtype)
100
+
101
+ # Permute the dimensions back to (b, target_h, target_w, dim)
102
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
103
+
104
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
105
+ image_features = image_features.flatten(1, 2)
106
+
107
+ return image_features
108
+
109
+ def _forward(self, images):
110
+ # logger.warning(f"images shape: {images.shape}")
111
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
112
+ image_forward_outs = self.vision_tower.forward(
113
+ images.to(device=self.device, dtype=self.dtype)
114
+ )
115
+ # logger.warning(f"image_forward_outs shape: {image_forward_outs['last_hidden_state'].shape}")
116
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
117
+ # logger.warning(f"image_features shape: {image_features.shape}")
118
+ interp_features = self.interpolate(image_features)
119
+ # logger.warning(f"interp_features shape: {interp_features.shape}")
120
+ return interp_features
121
+
122
+ @property
123
+ def num_patches_per_side(self):
124
+ return int(self.num_patches**0.5)
125
+
126
+ @property
127
+ def num_patches(self):
128
+ if self._interp_size is None:
129
+ return (self._image_size // self._patch_size) ** 2
130
+ else:
131
+ return self._interp_size
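The interpolate() step above is the trick shared by both towers: whatever the native patch count, features are resampled to a fixed token budget. A standalone shape check of the same reshape-resize-flatten sequence (values are random, dimensions are illustrative):

    import torch
    import torch.nn.functional as F

    feats = torch.randn(2, 729, 1536)        # e.g. a 27x27 grid of patch features
    b, n, d = feats.shape
    side, target_side = int(n ** 0.5), 24    # resample 27x27 -> 24x24 = 576 tokens
    grid = feats.view(b, side, side, d).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=(target_side, target_side),
                         mode="bilinear", align_corners=False)
    resampled = grid.permute(0, 2, 3, 1).flatten(1, 2)
    print(resampled.shape)                   # torch.Size([2, 576, 1536])
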
longvu/multimodal_encoder/drop.py ADDED
@@ -0,0 +1,41 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright (c) 2023-present, BAAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+
17
+ # pyre-unsafe
18
+ """Drop regularization layers."""
19
+
20
+ from torch import nn
21
+
22
+
23
+ class DropPathV(nn.Module):
24
+ """Set examples to zero randomly."""
25
+
26
+ def __init__(self, p=0.1, inplace=False):
27
+ super(DropPathV, self).__init__()
28
+ self.p = p
29
+ self.inplace = inplace
30
+
31
+ def forward(self, input):
32
+ if not self.training or self.p <= 0:
33
+ return input
34
+ keep_p = 1 - self.p
35
+ shape = (input.shape[0],) + (1,) * (input.dim() - 1)
36
+ scale = input.new_empty(shape).bernoulli_(keep_p).div_(keep_p)
37
+ return input.mul_(scale) if self.inplace else input.mul(scale)
38
+
39
+ def extra_repr(self):
40
+ inplace_str = ", inplace" if self.inplace else ""
41
+ return "p={}{}".format(self.p, inplace_str)
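DropPathV implements stochastic depth at the example level: during training each sample in the batch is either zeroed entirely or scaled by 1/keep_prob so the expectation is unchanged, and at eval time it is the identity. A quick illustration:

    import torch
    from longvu.multimodal_encoder.drop import DropPathV

    drop = DropPathV(p=0.5)
    drop.train()
    x = torch.ones(6, 3)
    print(drop(x))        # each row is either all zeros or all 2.0
    drop.eval()
    print(drop(x))        # unchanged at inference time
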
longvu/multimodal_encoder/image.py ADDED
@@ -0,0 +1,80 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright (c) 2023-present, BAAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+
17
+ # pyre-unsafe
18
+ """Image utilities."""
19
+
20
+ import numpy as np
21
+ import PIL.Image
22
+ import torch
23
+
24
+
25
+ def im_resize(img, size=None, scale=None, mode="linear"):
26
+ """Resize image by the scale or size."""
27
+ if size is None:
28
+ if not isinstance(scale, (tuple, list)):
29
+ scale = (scale, scale)
30
+ h, w = img.shape[:2]
31
+ size = int(h * scale[0] + 0.5), int(w * scale[1] + 0.5)
32
+ else:
33
+ if not isinstance(size, (tuple, list)):
34
+ size = (size, size)
35
+ resize_modes = {"linear": PIL.Image.BILINEAR}
36
+ from torchvision.transforms import ToPILImage
37
+
38
+ to_pil = ToPILImage()
39
+ img = to_pil(img.to(torch.float32).cpu())
40
+ # img = PIL.Image.fromarray(img)
41
+ return np.array(img.resize(size[::-1], resize_modes[mode]))
42
+
43
+
44
+ def im_rescale(img, scales, max_size=0):
45
+ """Rescale image to match the detecting scales."""
46
+ im_shape = img.shape
47
+ img_list, img_scales = [], []
48
+ size_min = np.min(im_shape[:2])
49
+ size_max = np.max(im_shape[:2])
50
+ for target_size in scales:
51
+ im_scale = float(target_size) / float(size_min)
52
+ target_size_max = max_size if max_size > 0 else target_size
53
+ if np.round(im_scale * size_max) > target_size_max:
54
+ im_scale = float(target_size_max) / float(size_max)
55
+ img_list.append(im_resize(img, scale=im_scale))
56
+ img_scales.append((im_scale, im_scale))
57
+ return img_list, img_scales
58
+
59
+
60
+ def im_vstack(arrays, fill_value=None, dtype=None, size=None, align=None):
61
+ """Stack image arrays in sequence vertically."""
62
+ if fill_value is None:
63
+ return np.vstack(arrays)
64
+ # Compute the max stack shape.
65
+ max_shape = np.max(np.stack([arr.shape for arr in arrays]), 0)
66
+ if size is not None and min(size) > 0:
67
+ max_shape[: len(size)] = size
68
+ if align is not None and min(align) > 0:
69
+ align_size = np.ceil(max_shape[: len(align)] / align)
70
+ max_shape[: len(align)] = align_size.astype("int64") * align
71
+ # Fill output with the given value.
72
+ output_dtype = dtype or arrays[0].dtype
73
+ output_shape = [len(arrays)] + list(max_shape)
74
+ output = np.empty(output_shape, output_dtype)
75
+ output[:] = fill_value
76
+ # Copy arrays.
77
+ for i, arr in enumerate(arrays):
78
+ copy_slices = (slice(0, d) for d in arr.shape)
79
+ output[(i,) + tuple(copy_slices)] = arr
80
+ return output
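im_vstack pads a list of differently sized HWC arrays to a common (optionally alignment-rounded) shape before stacking, which is how variable-resolution frames can share one batch array. Illustrative shapes:

    import numpy as np
    from longvu.multimodal_encoder.image import im_vstack

    frames = [np.zeros((360, 480, 3), "uint8"), np.zeros((400, 300, 3), "uint8")]
    batch = im_vstack(frames, fill_value=0, align=(32, 32))
    print(batch.shape)    # (2, 416, 480, 3): padded to the max size, aligned to 32
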
longvu/multimodal_encoder/logging.py ADDED
@@ -0,0 +1,131 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright (c) 2023-present, BAAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+
17
+ # pyre-unsafe
18
+ """Logging utilities."""
19
+
20
+ import inspect
21
+ import logging as _logging
22
+ import os
23
+ import sys as _sys
24
+ import threading
25
+
26
+
27
+ _logger = None
28
+ _logger_lock = threading.Lock()
29
+
30
+
31
+ def get_logger():
32
+ global _logger
33
+ # Use double-checked locking to avoid taking lock unnecessarily.
34
+ if _logger:
35
+ return _logger
36
+ _logger_lock.acquire()
37
+ try:
38
+ if _logger:
39
+ return _logger
40
+ logger = _logging.getLogger("tokenize-anything")
41
+ logger.setLevel("INFO")
42
+ logger.propagate = False
43
+ logger._is_root = True
44
+ if True:
45
+ # Determine whether we are in an interactive environment.
46
+ _interactive = False
47
+ try:
48
+ # This is only defined in interactive shells.
49
+ if _sys.ps1:
50
+ _interactive = True
51
+ except AttributeError:
52
+ # Even now, we may be in an interactive shell with `python -i`.
53
+ _interactive = _sys.flags.interactive
54
+ # If we are in an interactive environment (like Jupyter), set loglevel
55
+ # to INFO and pipe the output to stdout.
56
+ if _interactive:
57
+ logger.setLevel("INFO")
58
+ _logging_target = _sys.stdout
59
+ else:
60
+ _logging_target = _sys.stderr
61
+ # Add the output handler.
62
+ _handler = _logging.StreamHandler(_logging_target)
63
+ _handler.setFormatter(_logging.Formatter("%(levelname)s %(message)s"))
64
+ logger.addHandler(_handler)
65
+ _logger = logger
66
+ return _logger
67
+ finally:
68
+ _logger_lock.release()
69
+
70
+
71
+ def _detailed_msg(msg):
72
+ file, lineno = inspect.stack()[:3][2][1:3]
73
+ return "{}:{}] {}".format(os.path.split(file)[-1], lineno, msg)
74
+
75
+
76
+ def log(level, msg, *args, **kwargs):
77
+ get_logger().log(level, _detailed_msg(msg), *args, **kwargs)
78
+
79
+
80
+ def debug(msg, *args, **kwargs):
81
+ if is_root():
82
+ get_logger().debug(_detailed_msg(msg), *args, **kwargs)
83
+
84
+
85
+ def error(msg, *args, **kwargs):
86
+ get_logger().error(_detailed_msg(msg), *args, **kwargs)
87
+ assert 0
88
+
89
+
90
+ def fatal(msg, *args, **kwargs):
91
+ get_logger().fatal(_detailed_msg(msg), *args, **kwargs)
92
+ assert 0
93
+
94
+
95
+ def info(msg, *args, **kwargs):
96
+ if is_root():
97
+ get_logger().info(_detailed_msg(msg), *args, **kwargs)
98
+
99
+
100
+ def warning(msg, *args, **kwargs):
101
+ if is_root():
102
+ get_logger().warning(_detailed_msg(msg), *args, **kwargs)
103
+
104
+
105
+ def get_verbosity():
106
+ """Return how much logging output will be produced."""
107
+ return get_logger().getEffectiveLevel()
108
+
109
+
110
+ def set_verbosity(v):
111
+ """Set the threshold for what messages will be logged."""
112
+ get_logger().setLevel(v)
113
+
114
+
115
+ def set_formatter(fmt=None, datefmt=None):
116
+ """Set the formatter."""
117
+ handler = _logging.StreamHandler(_sys.stderr)
118
+ handler.setFormatter(_logging.Formatter(fmt, datefmt))
119
+ logger = get_logger()
120
+ logger.removeHandler(logger.handlers[0])
121
+ logger.addHandler(handler)
122
+
123
+
124
+ def set_root(is_root=True):
125
+ """Set logger to the root."""
126
+ get_logger()._is_root = is_root
127
+
128
+
129
+ def is_root():
130
+ """Return logger is the root."""
131
+ return get_logger()._is_root
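These helpers all route through one lazily created, process-wide logger, and only the "root" process emits info/debug. A short, illustrative usage (the message text is arbitrary):

    from longvu.multimodal_encoder import logging as enc_logging

    enc_logging.info("vision tower loaded")   # e.g. "INFO my_script.py:3] vision tower loaded"
    enc_logging.set_verbosity("WARNING")      # silence info() from here on
    enc_logging.set_root(False)               # e.g. on non-zero ranks under DDP
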
longvu/multimodal_encoder/loss.py ADDED
@@ -0,0 +1,96 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright (c) 2023-present, BAAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+
17
+ # pyre-unsafe
18
+ """Loss layers."""
19
+
20
+ from torch import nn
21
+
22
+
23
+ def reduce_loss(loss, reduction="mean"):
24
+ """Reduce the loss."""
25
+ if reduction == "mean" or reduction == "sum":
26
+ return getattr(loss, reduction)()
27
+ if reduction == "batch_mean":
28
+ return loss.sum().mul_(1.0 / loss.size(0))
29
+ return loss
30
+
31
+
32
+ class BinaryFocalLoss(nn.Module):
33
+ """Binary focal loss."""
34
+
35
+ def __init__(self, alpha=0.25, reduction="none"):
36
+ super(BinaryFocalLoss, self).__init__()
37
+ self.alpha = alpha
38
+ self.reduction = reduction
39
+
40
+ def forward(self, input, target):
41
+ alpha, p = self.alpha, input.sigmoid()
42
+ neg_alpha, neg_target = 1.0 - alpha, 1.0 - target
43
+ alpha_weight = target.mul(alpha).add_(neg_target.mul(neg_alpha))
44
+ focal_weight = (1.0 - p).mul_(target).add_(p.mul(neg_target)).square()
45
+ loss = nn.functional.binary_cross_entropy_with_logits(
46
+ input, target, reduction="none"
47
+ )
48
+ return reduce_loss(loss * focal_weight.mul_(alpha_weight), self.reduction)
49
+
50
+
51
+ class BinaryDiceLoss(nn.Module):
52
+ """Binary dice loss."""
53
+
54
+ def __init__(self, eps=1.0, reduction="none"):
55
+ super(BinaryDiceLoss, self).__init__()
56
+ self.eps = eps
57
+ self.reduction = reduction
58
+
59
+ def forward(self, input, target):
60
+ input = input.sigmoid()
61
+ num = input.mul(target).sum(-1).mul_(2).add_(self.eps)
62
+ den = input.add(target).sum(-1).add_(self.eps)
63
+ return reduce_loss(1.0 - num / den, self.reduction)
64
+
65
+
66
+ class CrossEntropyLoss(nn.Module):
67
+ """Cross entropy loss with label smoothing."""
68
+
69
+ def __init__(self, epsilon=0, reduction="none"):
70
+ super(CrossEntropyLoss, self).__init__()
71
+ self.epsilon = epsilon
72
+ self.reduction = reduction
73
+
74
+ def forward_dense(self, input, target):
75
+ dim, target = input.shape[-1], target.squeeze_()
76
+ x = nn.functional.log_softmax(input, dim=-1)
77
+ y = nn.functional.one_hot(target, dim).float()
78
+ x = (
79
+ x.permute([0, x.dim() - 1] + list(range(x.dim()))[1:-1])
80
+ if x.dim() > 2
81
+ else x
82
+ )
83
+ y = (
84
+ y.permute([0, y.dim() - 1] + list(range(y.dim()))[1:-1])
85
+ if y.dim() > 2
86
+ else y
87
+ )
88
+ loss = nn.functional.cross_entropy(
89
+ x, y, reduction="none", label_smoothing=self.epsilon
90
+ )
91
+ return reduce_loss(loss, self.reduction)
92
+
93
+ def forward(self, input, target):
94
+ if self.epsilon > 0:
95
+ return self.forward_dense(input, target)
96
+ return nn.functional.cross_entropy(input, target, reduction=self.reduction)
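Shape-wise, the binary losses expect logits and {0,1} targets of the same shape; dice reduces over the last dimension while focal is element-wise before the final reduction. A random-tensor sketch of how they might be combined per mask:

    import torch
    from longvu.multimodal_encoder.loss import BinaryDiceLoss, BinaryFocalLoss

    logits = torch.randn(4, 1024)                       # 4 masks, 1024 "pixels" each
    target = torch.randint(0, 2, (4, 1024)).float()
    focal = BinaryFocalLoss(alpha=0.25, reduction="batch_mean")(logits, target)
    dice = BinaryDiceLoss(eps=1.0, reduction="batch_mean")(logits, target)
    print(focal.item(), dice.item())
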
longvu/multimodal_encoder/registry.py ADDED
@@ -0,0 +1,56 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright (c) 2023-present, BAAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+
17
+ # pyre-unsafe
18
+ """Registry utilities."""
19
+
20
+ import collections
21
+ import functools
22
+
23
+
24
+ class Registry(object):
25
+ """Registry class."""
26
+
27
+ def __init__(self, name):
28
+ self.name = name
29
+ self.registry = collections.OrderedDict()
30
+
31
+ def has(self, key):
32
+ return key in self.registry
33
+
34
+ def register(self, name, func=None, **kwargs):
35
+ def decorated(inner_function):
36
+ for key in name if isinstance(name, (tuple, list)) else [name]:
37
+ self.registry[key] = functools.partial(inner_function, **kwargs)
38
+ return inner_function
39
+
40
+ if func is not None:
41
+ return decorated(func)
42
+ return decorated
43
+
44
+ def get(self, name, default=None):
45
+ if name is None:
46
+ return None
47
+ if not self.has(name):
48
+ if default is not None:
49
+ return default
50
+ raise KeyError("`%s` is not registered in <%s>." % (name, self.name))
51
+ return self.registry[name]
52
+
53
+ def try_get(self, name):
54
+ if self.has(name):
55
+ return self.get(name)
56
+ return None
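Registry maps string keys to factory callables, with any keyword arguments bound at registration time, so variants can be selected by name from a config. A toy example (the "toy" entry is invented for illustration):

    from longvu.multimodal_encoder.registry import Registry

    ENCODERS = Registry("encoders")

    @ENCODERS.register("toy", scale=2)
    def make_toy(scale=1):
        return {"scale": scale}

    print(ENCODERS.get("toy")())         # {'scale': 2}: kwargs were bound by register()
    print(ENCODERS.try_get("missing"))   # None instead of a KeyError
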
longvu/multimodal_encoder/siglip_encoder.py ADDED
@@ -0,0 +1,78 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+ from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
5
+
6
+ from .base_encoder import BaseVisionTower, ProcessorWrapper
7
+
8
+
9
+ class SiglipVisionTower(BaseVisionTower):
10
+ def __init__(self, vision_tower_name, args, delay_load=False):
11
+ super(SiglipVisionTower, self).__init__(vision_tower_name, args, delay_load)
12
+
13
+ model_path = "./checkpoints/siglip-so400m-patch14-384"
14
+ base_model_name, res, interp = model_path, 384, 576
15
+ self.vision_tower_name = base_model_name
16
+ self._image_size = res if res is not None else 512
17
+ self._interp_size = interp
18
+ if not self.delay_load:
19
+ self.load_model()
20
+ elif self.unfreeze_mm_vision_tower:
21
+ self.load_model()
22
+ else:
23
+ self._hidden_size = 1152
24
+
25
+ def load_model(self, device_map=None):
26
+ self.vision_model = "siglip"
27
+ # clip_model, processor = create_model_from_pretrained(self.vision_tower_name)
28
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
29
+
30
+ # self.vision_tower = clip_model.visual.trunk
31
+ self.vision_tower.output_tokens = True
32
+
33
+ self._hidden_size = self.vision_tower.config.hidden_size
34
+ self._image_size = self.vision_tower.config.image_size
35
+ self._patch_size = self.vision_tower.config.patch_size
36
+ self.image_processor = SiglipImageProcessor.from_pretrained(
37
+ self.vision_tower_name
38
+ )
39
+
40
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
41
+ self.is_loaded = True
42
+
43
+ def interpolate(self, image_features):
44
+ if self._interp_size is None:
45
+ return image_features
46
+
47
+ b, num_tokens, dim = image_features.shape
48
+
49
+ if num_tokens != self.num_patches:
50
+ target_h = target_w = int(self._interp_size**0.5)
51
+ h = w = int(num_tokens**0.5)
52
+
53
+ image_features = image_features.view(b, h, w, dim)
54
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
55
+
56
+ image_features = F.interpolate(
57
+ image_features.to(torch.float32),
58
+ size=(target_h, target_w),
59
+ mode="bilinear",
60
+ align_corners=False,
61
+ ).to(image_features.dtype)
62
+
63
+ # Permute the dimensions back to (b, target_h, target_w, dim)
64
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
65
+
66
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
67
+ image_features = image_features.flatten(1, 2)
68
+
69
+ return image_features
70
+
71
+ def _forward(self, images, interpolate_token=576):
72
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
73
+ image_features = self.vision_tower.forward(
74
+ images.to(device=self.device, dtype=self.dtype),
75
+ output_hidden_states=True,
76
+ ).hidden_states[-1]
77
+ interp_features = self.interpolate(image_features)
78
+ return interp_features
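With delay_load set, SiglipVisionTower still advertises its hidden size and token budget before any weights are touched, which downstream builders can rely on. A sketch with an assumed args namespace (the ./checkpoints path hard-coded above must exist before load_model() is actually called):

    from types import SimpleNamespace
    from longvu.multimodal_encoder.siglip_encoder import SiglipVisionTower

    args = SimpleNamespace(mm_vision_select_layer=-2, unfreeze_mm_vision_tower=False)
    tower = SiglipVisionTower("siglip-so400m-patch14-384-interp576", args, delay_load=True)
    print(tower.hidden_size, tower.num_patches)   # 1152 576, without loading weights
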
longvu/multimodal_encoder/utils.py ADDED
@@ -0,0 +1,66 @@
1
+ # ------------------------------------------------------------------------
2
+ # Copyright (c) 2023-present, BAAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # ------------------------------------------------------------------------
16
+
17
+ # pyre-unsafe
18
+ """Layer utilities."""
19
+
20
+ import cv2
21
+ import numpy as np
22
+ import torch
23
+
24
+
25
+ def init_cross_conv(blocks):
26
+ """Initialize convolutional cross attention."""
27
+ for m in blocks.modules():
28
+ if isinstance(m, torch.nn.Conv2d):
29
+ torch.nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
30
+ for blk in blocks:
31
+ torch.nn.init.constant_(blk.norm3.weight, 0)
32
+
33
+
34
+ def set_dropout(module, dropout):
35
+ """Initialize dropout."""
36
+ for m in [m for m in module.modules() if isinstance(m, torch.nn.Dropout)]:
37
+ m.p = dropout
38
+
39
+
40
+ def set_drop_path(blocks, drop_path):
41
+ """Initialize drop path."""
42
+ if not isinstance(blocks, torch.nn.ModuleList):
43
+ blocks = getattr(blocks, "blocks", getattr(blocks, "layers", None))
44
+ for i, blk in enumerate(blocks):
45
+ for m in [m for m in blk.modules() if type(m).__name__ == "DropPath"]:
46
+ m.p = i * drop_path / (len(blocks) - 1)
47
+
48
+
49
+ def set_sync_batch_norm(module, ddp_group):
50
+ """Set data parallelism group for sync batch norm."""
51
+ for m in module.modules():
52
+ if isinstance(m, torch.nn.SyncBatchNorm):
53
+ m.process_group = ddp_group
54
+
55
+
56
+ def resize_pos_embed(weight, out_len):
57
+ """Resize position embedding weights."""
58
+ out_h = out_w = int(out_len**0.5)
59
+ h = w = int(weight.shape[0] ** 0.5)
60
+ weight = weight.reshape((h, w, weight.shape[1]))
61
+ out_weight = [
62
+ cv2.resize(x, (out_w, out_h), interpolation=cv2.INTER_CUBIC)
63
+ for x in np.split(weight.astype("float32", copy=False), 4, axis=-1)
64
+ ]
65
+ out_weight = np.concatenate(out_weight, axis=-1)
66
+ return out_weight.reshape((-1, weight.shape[-1])).astype(weight.dtype, copy=False)
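resize_pos_embed is the numpy-side counterpart of the token interpolation above: it bicubically resizes a square grid of position embeddings to a new grid length. Illustrative only; the 4-way channel split requires the embedding dimension to be divisible by 4:

    import numpy as np
    from longvu.multimodal_encoder.utils import resize_pos_embed

    pos = np.random.randn(576, 1024).astype("float32")   # 24x24 grid
    print(resize_pos_embed(pos, 729).shape)              # (729, 1024), i.e. 27x27
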
longvu/multimodal_projector/__pycache__/builder.cpython-310.pyc ADDED
Binary file (2.01 kB). View file
 
longvu/multimodal_projector/builder.py ADDED
@@ -0,0 +1,52 @@
1
+ # pyre-unsafe
2
+ import re
3
+
4
+ import torch.nn as nn
5
+
6
+
7
+ class IdentityMap(nn.Module):
8
+ def __init__(self):
9
+ super().__init__()
10
+
11
+ def forward(self, x, *args, **kwargs):
12
+ return x
13
+
14
+ @property
15
+ def config(self):
16
+ return {"mm_projector_type": "identity"}
17
+
18
+
19
+ class SimpleResBlock(nn.Module):
20
+ def __init__(self, channels):
21
+ super().__init__()
22
+ self.pre_norm = nn.LayerNorm(channels)
23
+
24
+ self.proj = nn.Sequential(
25
+ nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
26
+ )
27
+
28
+ def forward(self, x):
29
+ x = self.pre_norm(x)
30
+ return x + self.proj(x)
31
+
32
+
33
+ def build_vision_projector(config, delay_load=False, **kwargs):
34
+ projector_type = getattr(config, "mm_projector_type", "linear")
35
+ config.mm_hidden_size = 256
36
+
37
+ if projector_type == "linear":
38
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
39
+
40
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
41
+ if mlp_gelu_match:
42
+ mlp_depth = int(mlp_gelu_match.group(1))
43
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
44
+ for _ in range(1, mlp_depth):
45
+ modules.append(nn.GELU())
46
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
47
+ return nn.Sequential(*modules)
48
+
49
+ if projector_type == "identity":
50
+ return IdentityMap()
51
+
52
+ raise ValueError(f"Unknown projector type: {projector_type}")
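build_vision_projector reads mm_projector_type off the config (note that mm_hidden_size is overridden to 256 above) and returns a plain nn.Module. A sketch with a hypothetical config namespace:

    from types import SimpleNamespace
    from longvu.multimodal_projector.builder import build_vision_projector

    cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", hidden_size=3072)
    proj = build_vision_projector(cfg)
    print(proj)   # Sequential: Linear(256->3072), GELU, Linear(3072->3072)
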
longvu/utils.py ADDED
@@ -0,0 +1,25 @@
1
+ # pyre-unsafe
2
+ from transformers import AutoConfig
3
+
4
+
5
+ def auto_upgrade(config):
6
+ cfg = AutoConfig.from_pretrained(config)
7
+ if "llava" in config and "llava" not in cfg.model_type:
8
+ assert cfg.model_type == "llama"
9
+ print(
10
+ "You are using newer LLaVA code base, while the checkpoint of v0 is from older code base."
11
+ )
12
+ print(
13
+ "You must upgrade the checkpoint to the new code base (this can be done automatically)."
14
+ )
15
+ confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
16
+ if confirm.lower() in ["y", "yes"]:
17
+ print("Upgrading checkpoint...")
18
+ assert len(cfg.architectures) == 1
19
+ setattr(cfg.__class__, "model_type", "llava")
20
+ cfg.architectures[0] = "LlavaLlamaForCausalLM"
21
+ cfg.save_pretrained(config)
22
+ print("Checkpoint upgraded.")
23
+ else:
24
+ print("Checkpoint upgrade aborted.")
25
+ exit(1)
longvu/vision_sampler.py ADDED
@@ -0,0 +1,566 @@
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+
8
+
9
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
10
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
11
+ """
12
+ grid_size: int of the grid height and width
13
+ return:
14
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
15
+ """
16
+ grid_h = np.arange(grid_size, dtype=np.float32)
17
+ grid_w = np.arange(grid_size, dtype=np.float32)
18
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
19
+ grid = np.stack(grid, axis=0)
20
+
21
+ grid = grid.reshape([2, 1, grid_size, grid_size])
22
+
23
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
24
+ if cls_token:
25
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
26
+ return pos_embed
27
+
28
+
29
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
30
+ assert embed_dim % 2 == 0
31
+
32
+ # use half of dimensions to encode grid_h
33
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
34
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
35
+
36
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
37
+ return emb
38
+
39
+
40
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
41
+ """
42
+ embed_dim: output dimension for each position
43
+ pos: a list of positions to be encoded: size (M,)
44
+ out: (M, D)
45
+ """
46
+ assert embed_dim % 2 == 0
47
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
48
+ omega /= embed_dim / 2.0
49
+ omega = 1.0 / 10000**omega # (D/2,)
50
+
51
+ pos = pos.reshape(-1) # (M,)
52
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
53
+
54
+ emb_sin = np.sin(out) # (M, D/2)
55
+ emb_cos = np.cos(out) # (M, D/2)
56
+
57
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
58
+ return emb
59
+
60
+
61
+ class CrossAttention(nn.Module):
62
+
63
+ def __init__(self, q_dim, kv_dim, hidden_dim, num_heads, attention_bias=False):
64
+ super().__init__()
65
+ self.hidden_dim = hidden_dim
66
+ self.num_heads = num_heads
67
+ self.head_dim = self.hidden_dim // self.num_heads
68
+
69
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
70
+ raise ValueError(
71
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
72
+ f" and `num_heads`: {self.num_heads})."
73
+ )
74
+
75
+ self.q_proj = nn.Sequential(
76
+ nn.LayerNorm(q_dim),
77
+ nn.Linear(q_dim, self.num_heads * self.head_dim, bias=attention_bias),
78
+ )
79
+ self.k_proj = nn.Sequential(
80
+ nn.LayerNorm(kv_dim),
81
+ nn.Linear(kv_dim, self.num_heads * self.head_dim, bias=attention_bias),
82
+ )
83
+ self.v_proj = nn.Sequential(
84
+ nn.LayerNorm(kv_dim),
85
+ nn.Linear(kv_dim, self.num_heads * self.head_dim, bias=attention_bias),
86
+ )
87
+ self.o_proj = nn.Linear(
88
+ self.num_heads * self.head_dim, q_dim, bias=attention_bias
89
+ )
90
+
91
+ def forward(self, vision_latents, queries, attention_mask):
92
+
93
+ bsz, q_len, _ = queries.size()
94
+ bsz, v_len, _ = vision_latents.size()
95
+
96
+ query_states = self.q_proj(queries)
97
+ key_states = self.k_proj(vision_latents)
98
+ value_states = self.v_proj(vision_latents)
99
+
100
+ query_states = query_states.view(
101
+ bsz, q_len, self.num_heads, self.head_dim
102
+ ).transpose(1, 2)
103
+ key_states = key_states.view(
104
+ bsz, v_len, self.num_heads, self.head_dim
105
+ ).transpose(1, 2)
106
+ value_states = value_states.view(
107
+ bsz, v_len, self.num_heads, self.head_dim
108
+ ).transpose(1, 2)
109
+
110
+ if attention_mask is not None:
111
+ if attention_mask.size() != (bsz, 1, q_len, v_len):
112
+ raise ValueError(
113
+ f"Attention mask should be of size {(bsz, 1, q_len, v_len)}, but is {attention_mask.size()}"
114
+ )
115
+
116
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
117
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
118
+ if query_states.device.type == "cuda" and attention_mask is not None:
119
+ query_states = query_states.contiguous()
120
+ key_states = key_states.contiguous()
121
+ value_states = value_states.contiguous()
122
+
123
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
124
+ query_states,
125
+ key_states,
126
+ value_states,
127
+ attn_mask=attention_mask,
128
+ )
129
+
130
+ attn_output = attn_output.transpose(1, 2).contiguous()
131
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_dim)
132
+
133
+ attn_output = self.o_proj(attn_output)
134
+
135
+ return attn_output
136
+
137
+
138
+ class AggregationBlock(nn.Module):
139
+ def __init__(
140
+ self, attention, q_dim, kv_dim, hidden_dim, num_heads, attention_bias=False
141
+ ):
142
+ super().__init__()
143
+ self.hidden_dim = hidden_dim
144
+ self.num_heads = num_heads
145
+ self.head_dim = self.hidden_dim // self.num_heads
146
+
147
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
148
+ raise ValueError(
149
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
150
+ f" and `num_heads`: {self.num_heads})."
151
+ )
152
+
153
+ self.attention = attention
154
+ if attention:
155
+ self.attention_layer = CrossAttention(
156
+ q_dim, kv_dim, hidden_dim, num_heads, attention_bias
157
+ )
158
+ else:
159
+ self.attention_layer = MLP(kv_dim, q_dim, q_dim)
160
+
161
+ def forward(self, vision_latents, queries, attention_mask):
162
+ if self.attention:
163
+ queries = self.attention_layer(vision_latents, queries, attention_mask)
164
+ else:
165
+ queries = self.attention_layer(vision_latents)
166
+
167
+ return queries
168
+
169
+
170
+ class MultiKVCrossAttention(nn.Module):
171
+
172
+ def __init__(self, q_dim, kv_dim_list, hidden_dim, num_heads, attention_bias=False):
173
+ super().__init__()
174
+
175
+ self.hidden_dim = hidden_dim
176
+ self.num_heads = num_heads
177
+ self.head_dim = self.hidden_dim // self.num_heads
178
+
179
+ if (self.head_dim * self.num_heads) != self.hidden_dim:
180
+ raise ValueError(
181
+ f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
182
+ f" and `num_heads`: {self.num_heads})."
183
+ )
184
+
185
+ self.q_proj = nn.Sequential(
186
+ nn.LayerNorm(q_dim),
187
+ nn.Linear(q_dim, self.num_heads * self.head_dim, bias=attention_bias),
188
+ )
189
+ self.num_of_kvs = len(kv_dim_list)
190
+ for i, kv_dim in enumerate(kv_dim_list):
191
+ setattr(
192
+ self,
193
+ "k_proj_{}".format(i),
194
+ nn.Sequential(
195
+ nn.LayerNorm(kv_dim),
196
+ nn.Linear(
197
+ kv_dim, self.num_heads * self.head_dim, bias=attention_bias
198
+ ),
199
+ ),
200
+ )
201
+ setattr(
202
+ self,
203
+ "v_proj_{}".format(i),
204
+ nn.Sequential(
205
+ nn.LayerNorm(kv_dim),
206
+ nn.Linear(
207
+ kv_dim, self.num_heads * self.head_dim, bias=attention_bias
208
+ ),
209
+ ),
210
+ )
211
+ self.o_proj = nn.Linear(
212
+ self.num_heads * self.head_dim, q_dim, bias=attention_bias
213
+ )
214
+
215
+ def forward(
216
+ self,
217
+ queries,
218
+ *vision_latents_attention_mask_list,
219
+ ):
220
+
221
+ vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
222
+ attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
223
+
224
+ bsz, q_len, _ = queries.size()
225
+
226
+ query_states = self.q_proj(queries)
227
+ key_states = torch.cat(
228
+ [
229
+ getattr(self, "k_proj_{}".format(i))(vision_latents_list[i])
230
+ for i in range(self.num_of_kvs)
231
+ ],
232
+ dim=1,
233
+ )
234
+ value_states = torch.cat(
235
+ [
236
+ getattr(self, "v_proj_{}".format(i))(vision_latents_list[i])
237
+ for i in range(self.num_of_kvs)
238
+ ],
239
+ dim=1,
240
+ )
241
+
242
+ v_len = key_states.shape[1]
243
+
244
+ query_states = query_states.view(
245
+ bsz, q_len, self.num_heads, self.head_dim
246
+ ).transpose(1, 2)
247
+ key_states = key_states.view(
248
+ bsz, v_len, self.num_heads, self.head_dim
249
+ ).transpose(1, 2)
250
+ value_states = value_states.view(
251
+ bsz, v_len, self.num_heads, self.head_dim
252
+ ).transpose(1, 2)
253
+
254
+ # if kv_weight is not None:
255
+ # kv_weight = kv_weight.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
256
+
257
+ attention_mask = torch.cat(attention_mask_list, dim=-1)
258
+
259
+ if attention_mask is not None:
260
+ if attention_mask.size() != (bsz, 1, q_len, v_len):
261
+ raise ValueError(
262
+ f"Attention mask should be of size {(bsz, 1, q_len, v_len)}, but is {attention_mask.size()}"
263
+ )
264
+
265
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
266
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
267
+ if query_states.device.type == "cuda" and attention_mask is not None:
268
+ query_states = query_states.contiguous()
269
+ key_states = key_states.contiguous()
270
+ value_states = value_states.contiguous()
271
+
272
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
273
+ query_states,
274
+ key_states,
275
+ value_states,
276
+ attn_mask=attention_mask,
277
+ )
278
+ # attn_output = spda(
279
+ # query_states,
280
+ # key_states,
281
+ # value_states,
282
+ # attn_mask=attention_mask,
283
+ # additional_score=kv_weight
284
+ # )
285
+
286
+ attn_output = attn_output.transpose(1, 2).contiguous()
287
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_dim)
288
+
289
+ attn_output = self.o_proj(attn_output)
290
+
291
+ return attn_output
292
+
293
+
294
+ class MLP(nn.Module):
295
+     def __init__(self, d_in, d_hidden, d_out):
+         super().__init__()
+         self.linear_1 = nn.Linear(d_in, d_hidden, bias=False)
+         self.act = nn.GELU()
+         self.linear_2 = nn.Linear(d_hidden, d_out, bias=False)
+
+     def forward(self, x):
+         return self.linear_2(self.act(self.linear_1(x)))
+
+
+ class VisionCrossAttentionLayer(nn.Module):
+     def __init__(
+         self,
+         q_dim,
+         context_dim,
+         kv_dim_list,
+         kv_size_list,
+         hidden_dim=1024,
+         layer_idx=0,
+     ):
+         super().__init__()
+         num_heads = 16
+         self.num_of_kvs = len(kv_dim_list)
+
+         self.proj_context = nn.Linear(context_dim, hidden_dim, bias=False)
+         self.proj_in = nn.Linear(q_dim + hidden_dim, hidden_dim, bias=False)
+         # if self.num_of_kvs > 1:
+         #     self.weight_mlp = MLP(q_dim+hidden_dim, hidden_dim, self.num_of_kvs)
+         #     self.tower_weight = nn.Parameter(torch.zeros((self.num_of_kvs)))
+         self.proj_out = MLP(hidden_dim, hidden_dim, q_dim)
+
+         self.norm = nn.LayerNorm(hidden_dim)
+
+         self.cross_attn = MultiKVCrossAttention(
+             hidden_dim, kv_dim_list, hidden_dim, num_heads
+         )
+         self.kv_size_list = kv_size_list
+         for i, kv_size in enumerate(kv_size_list):
+             if kv_size > 1:
+                 setattr(
+                     self,
+                     "pos_embed_{}".format(i),
+                     nn.Parameter(torch.randn(kv_size**2, hidden_dim)),
+                 )
+                 # self.register_buffer("pos_embed_{}".format(i), torch.from_numpy(get_2d_sincos_pos_embed(hidden_dim, kv_size)).float(), persistent=False)
+
+     def forward(
+         self,
+         queries,
+         context_feature,
+         *vision_latents_attention_mask_list,
+     ) -> torch.FloatTensor:
+
+         residual = queries
+         # queries = self.proj_in(queries)
+         context_feature = self.proj_context(context_feature)
+         # queries = queries + context_feature
+         queries = torch.cat([queries, context_feature], -1)
+
+         # if self.num_of_kvs > 1:
+         #     kv_weight = self.weight_mlp(queries)  # B * 1 * num_tower
+         #     kv_weight = kv_weight + self.tower_weight.view(1, 1, -1)
+         #     kv_weight = kv_weight.softmax(-1)
+         #     kv_number_list = [size**2 for size in self.kv_size_list]
+         #     kv_weight = torch.repeat_interleave(kv_weight, torch.tensor(kv_number_list).to(kv_weight.device), dim=-1)
+         # else:
+         #     kv_weight = None
+
+         queries = self.proj_in(queries)
+
+         vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+         attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+         attention_mask_list_reshaped = []
+         if attention_mask_list is not None:
+             for attention_mask in attention_mask_list:
+                 attention_mask = attention_mask.view(attention_mask.shape[0], 1, 1, -1)
+                 attention_mask = attention_mask.expand(-1, -1, queries.shape[1], -1)
+                 attention_mask_list_reshaped.append(attention_mask)
+
+         vision_latents_pos_list = []
+         for i, vision_latents in enumerate(vision_latents_list):
+             if vision_latents.shape[1] > 1:
+                 vision_latents_pos_list.append(
+                     vision_latents
+                     + getattr(self, "pos_embed_{}".format(i))[None, :, :].to(
+                         vision_latents.dtype
+                     )
+                 )
+             else:
+                 vision_latents_pos_list.append(vision_latents)
+
+         # Cross Attention
+         attention_output = self.cross_attn(
+             queries, *vision_latents_pos_list, *attention_mask_list_reshaped
+         )
+
+         # attention_output = (attention_output * combination_weight).sum(2)
+         queries = queries + attention_output
+
+         queries = self.norm(queries)
+
+         queries = self.proj_out(queries)
+
+         queries = queries + residual
+
+         return queries
+
+
+ class VisionAggregationLayer(nn.Module):
+     def __init__(
+         self,
+         q_dim,
+         context_dim,
+         kv_dim_list,
+         kv_size_list,
+         hidden_dim=1024,
+         layer_idx=0,
+     ):
+         super().__init__()
+         num_heads = 16
+         self.num_of_kvs = len(kv_dim_list)
+
+         self.proj_context = nn.Linear(context_dim, hidden_dim, bias=False)
+         self.proj_in = nn.Linear(q_dim + hidden_dim, hidden_dim, bias=False)
+
+         self.proj_out = MLP(hidden_dim, hidden_dim, q_dim)
+
+         self.norm = nn.LayerNorm(hidden_dim)
+
+         if self.num_of_kvs > 1:
+             self.weight_mlp = MLP(q_dim + hidden_dim, hidden_dim, self.num_of_kvs)
+
+         for i, kv_size in enumerate(kv_size_list):
+             if kv_size > 1:
+                 setattr(
+                     self,
+                     "pos_embed_{}".format(i),
+                     nn.Parameter(torch.randn(kv_size**2, hidden_dim)),
+                 )
+                 setattr(
+                     self,
+                     "aggregate_{}".format(i),
+                     AggregationBlock(
+                         True, hidden_dim, kv_dim_list[i], hidden_dim, num_heads
+                     ),
+                 )
+             else:
+                 setattr(
+                     self,
+                     "aggregate_{}".format(i),
+                     AggregationBlock(
+                         False, hidden_dim, kv_dim_list[i], hidden_dim, num_heads
+                     ),
+                 )
+
+     def forward(
+         self,
+         queries,
+         context_feature,
+         *vision_latents_attention_mask_list,
+     ) -> torch.FloatTensor:
+
+         residual = queries
+         # queries = self.proj_in(queries)
+         context_feature = self.proj_context(context_feature)
+         # queries = queries + context_feature
+         queries = torch.cat([queries, context_feature], -1)
+
+         if self.num_of_kvs > 1:
+             combination_weight = self.weight_mlp(queries).softmax(
+                 -1
+             )  # B * 1 * num_tower
+             combination_weight = combination_weight.unsqueeze(-1)
+         else:
+             combination_weight = 1
+
+         queries = self.proj_in(queries)
+
+         vision_latents_list = vision_latents_attention_mask_list[: self.num_of_kvs]
+         attention_mask_list = vision_latents_attention_mask_list[self.num_of_kvs :]
+
+         attention_mask_list_reshaped = []
+         if attention_mask_list is not None:
+             for attention_mask in attention_mask_list:
+                 attention_mask = attention_mask.view(attention_mask.shape[0], 1, 1, -1)
+                 attention_mask = attention_mask.expand(-1, -1, queries.shape[1], -1)
+                 attention_mask_list_reshaped.append(attention_mask)
+
+         vision_latents_pos_list = []
+         for i, vision_latents in enumerate(vision_latents_list):
+             if vision_latents.shape[1] > 1:
+                 vision_latents_pos_list.append(
+                     vision_latents
+                     + getattr(self, "pos_embed_{}".format(i))[None, :, :].to(
+                         vision_latents.dtype
+                     )
+                 )
+             else:
+                 vision_latents_pos_list.append(vision_latents)
+
+         aggregated_vision_latents_list = []
+         for i, (vision_latents, attention_mask) in enumerate(
+             zip(vision_latents_pos_list, attention_mask_list_reshaped)
+         ):
+             aggregated_vision_latents_list.append(
+                 getattr(self, "aggregate_{}".format(i))(
+                     vision_latents, queries, attention_mask
+                 )
+             )
+
+         aggregated_vision_latents = torch.stack(aggregated_vision_latents_list, 2)
+
+         queries = queries + (aggregated_vision_latents * combination_weight).sum(2)
+
+         queries = self.norm(queries)
+
+         queries = self.proj_out(queries)
+
+         queries = queries + residual
+
+         return queries
+
+
+ class VisionTokenSampler(nn.Module):
+     def __init__(
+         self,
+         q_dim,
+         context_dim,
+         kv_dim_list,
+         kv_size_list,
+         vision_hidden_size,
+         num_of_layers=1,
+         layer_type="joint",
+     ):
+         super().__init__()
+         assert layer_type in ["joint", "sep"]
+         if layer_type == "joint":
+             self.layers = nn.ModuleList(
+                 [
+                     VisionCrossAttentionLayer(
+                         q_dim,
+                         context_dim,
+                         kv_dim_list,
+                         kv_size_list,
+                         vision_hidden_size,
+                         idx,
+                     )
+                     for idx in range(num_of_layers)
+                 ]
+             )
+         else:
+             self.layers = nn.ModuleList(
+                 [
+                     VisionAggregationLayer(
+                         q_dim,
+                         context_dim,
+                         kv_dim_list,
+                         kv_size_list,
+                         vision_hidden_size,
+                         idx,
+                     )
+                     for idx in range(num_of_layers)
+                 ]
+             )
+
+     def forward(self, queries, context_feature, *vision_latents_attention_mask_list):
+         for layer in self.layers:
+             queries = layer(
+                 queries, context_feature, *vision_latents_attention_mask_list
+             )
+         return queries
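Note: the classes above resample a set of query tokens against one or more sets of vision latents via cross-attention, then add the result back to the original queries through a residual connection. The snippet below is a minimal, self-contained sketch of that pattern for orientation only; it is not part of this upload, it substitutes torch.nn.MultiheadAttention for the repo's MultiKVCrossAttention, collapses the multi-source handling to a single vision source, and uses illustrative placeholder dimensions.

import torch
import torch.nn as nn


class TinyVisionResampler(nn.Module):
    # Single-source stand-in for the multi-source layers above (illustrative only).
    def __init__(self, q_dim=1024, kv_dim=1152, hidden_dim=1024, num_heads=16):
        super().__init__()
        self.proj_q = nn.Linear(q_dim, hidden_dim, bias=False)    # queries -> hidden space
        self.proj_kv = nn.Linear(kv_dim, hidden_dim, bias=False)  # vision latents -> hidden space
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.norm = nn.LayerNorm(hidden_dim)
        self.proj_out = nn.Linear(hidden_dim, q_dim, bias=False)  # hidden -> query dim

    def forward(self, queries, vision_latents):
        residual = queries
        q = self.proj_q(queries)
        kv = self.proj_kv(vision_latents)
        attn_out, _ = self.attn(q, kv, kv)   # queries attend to vision tokens
        q = self.norm(q + attn_out)
        return residual + self.proj_out(q)


queries = torch.randn(2, 64, 1024)       # batch x num_query_tokens x q_dim
vision = torch.randn(2, 24 * 24, 1152)   # batch x num_vision_tokens x kv_dim
out = TinyVisionResampler()(queries, vision)
print(out.shape)  # torch.Size([2, 64, 1024])

In the actual VisionTokenSampler this idea is applied per layer and per vision source, with learned positional embeddings added to any spatial latent grid (kv_size > 1) before attention.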
requirements.txt ADDED
@@ -0,0 +1,28 @@
+ huggingface_hub==0.22.2
+ torch==2.1.2
+ numpy==1.26.4
+ torchvision
+ transformers==4.42.4
+ tokenizers==0.15.2
+ sentencepiece==0.1.99
+ shortuuid
+ accelerate==0.34.2
+ peft==0.4.0
+ bitsandbytes==0.41.0
+ pydantic<2,>=1
+ markdown2
+ scikit-learn==1.2.2
+ gradio==3.35.2
+ gradio_client==0.2.9
+ requests
+ httpx==0.24.0
+ uvicorn
+ fastapi
+ einops==0.6.1
+ einops-exts==0.0.4
+ timm==0.9.16
+ decord
+ ninja
+ deepspeed==0.12.2
+ protobuf
+ iopath
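The pins above target the Gradio 3.x / Torch 2.1 stack that app.py imports. As an optional sanity check (not part of this upload), a few of these pins can be compared against the installed environment using only the standard library; the three package names and versions below are taken from the list above.

from importlib.metadata import PackageNotFoundError, version

# Spot-check a few pins from requirements.txt against the active environment.
pins = {"torch": "2.1.2", "transformers": "4.42.4", "gradio": "3.35.2"}
for name, expected in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name} {installed}: {status}")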