import io
import os
import warnings
import logging

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import MSELoss

from torch.cuda.amp import autocast

from modeling_internvideo2_vit import pretrain_internvideo2_giant_patch14_224_clean
from modeling_qformer import build_qformer
from model_config import VideoChat2Config

logger = logging.getLogger(__name__)

from transformers import LlamaTokenizer, AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoProcessor
from transformers import AutoConfig, PreTrainedModel

try:
    token = os.environ['HF_TOKEN']
except KeyError:
    warnings.warn(
        "HF_TOKEN was not found in the environment variables. Please make sure it is set correctly and that "
        "you have been granted access to the model. If you have not yet applied for access, please visit "
        "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3 to request it."
    )
    token = None

def disabled_train(self, mode=True):
    """Overwrite model.train with this function to make sure train/eval mode
    does not change anymore."""
    return self


def freeze_module(module):
    """Freeze all parameters of a module, switch it to eval mode, and disable .train()."""
    for _, param in module.named_parameters():
        param.requires_grad = False
    module = module.eval()
    module.train = disabled_train
    return module

class BaseMLLM(PreTrainedModel):
    config_class = VideoChat2Config

    def __init__(self, config):
        # Keep a handle on the nested model config before the PreTrainedModel constructor runs.
        self.model_config = config.model_config
        super().__init__(config)
        self.build_vision_encoder()
        if 'llm' in self.model_config:
            self.build_llm()
        if 'bridge' in self.model_config:
            self.build_bridge()
        if 'loss' in self.model_config:
            self.build_loss()

        # Log every parameter that is still trainable after the optional freezing above.
        for n, p in self.named_parameters():
            if p.requires_grad:
                logger.info(f'{n} requires_grad')
    def build_vision_encoder(self):
        if 'internvideo2' in self.model_config.vision_encoder.name.lower():
            encoder_name = self.model_config.vision_encoder.name
            logger.info(f"Build vision_encoder: {encoder_name}")
            if encoder_name == 'internvideo2-1B':
                self.vision_encoder = pretrain_internvideo2_giant_patch14_224_clean(self.model_config)
            else:
                raise ValueError(f"Not implemented: {encoder_name}")
        else:
            raise NotImplementedError(self.model_config.vision_encoder.name)

        # Optional LayerNorm on top of the ViT features.
        if self.model_config.vision_encoder.vit_add_ln:
            self.vision_layernorm = nn.LayerNorm(self.model_config.vision_encoder.encoder_embed_dim, eps=1e-12)
        else:
            self.vision_layernorm = nn.Identity()

        self.freeze_vision_encoder = self.model_config.get("freeze_vision_encoder", False)
        if self.freeze_vision_encoder:
            logger.info("freeze vision encoder")
            freeze_module(self.vision_encoder)
            freeze_module(self.vision_layernorm)
    def build_bridge(self):
        # Project between the Q-Former hidden size (768) and the LLM hidden size.
        self.project_up = nn.Linear(768, self.lm.config.hidden_size)
        self.project_down = nn.Linear(self.lm.config.hidden_size, 768)

        if 'qformer' in self.model_config.bridge.name.lower():
            from transformers import BertTokenizer
            self.qformer_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
            self.qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
            self.qformer_tokenizer.padding_side = "left"
            if self.model_config.bridge.name == 'qformer':
                self.qformer, self.query_tokens = build_qformer(
                    self.model_config.bridge.num_query_token, self.model_config.vision_encoder.encoder_embed_dim,
                    qformer_hidden_dropout_prob=self.model_config.bridge.qformer_hidden_dropout_prob,
                    qformer_attention_probs_dropout_prob=self.model_config.bridge.qformer_attention_probs_dropout_prob,
                    qformer_drop_path_rate=self.model_config.bridge.qformer_drop_path_rate,
                )
            self.qformer.resize_token_embeddings(len(self.qformer_tokenizer))
            self.qformer.cls = None
            self.extra_num_query_token = self.model_config.bridge.extra_num_query_token
            if self.model_config.bridge.extra_num_query_token > 0:
                logger.info(f"Add extra {self.model_config.bridge.extra_num_query_token} tokens in QFormer")
                self.extra_query_tokens = nn.Parameter(
                    torch.zeros(1, self.model_config.bridge.extra_num_query_token, self.query_tokens.shape[-1])
                )

        self.freeze_bridge = self.model_config.get("freeze_bridge", False)
        if self.freeze_bridge:
            logger.info("freeze bridge")
            freeze_module(self.qformer)
            self.query_tokens.requires_grad = False
    def build_llm(self):
        self.lm_name = self.model_config.llm.name
        if self.model_config.llm.name == 'mistral_7b':
            config = AutoConfig.from_pretrained(
                self.model_config.llm.pretrained_llm_path,
                torch_dtype=torch.bfloat16,
                token=token,
            )
            # Build from config only; no pretrained weights are downloaded here.
            self.lm = AutoModelForCausalLM.from_config(config)
        elif self.model_config.llm.name == 'internlm_20b':
            self.lm = AutoModelForCausalLM.from_pretrained(
                self.model_config.llm.pretrained_llm_path,
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
            )
            self.lm.gradient_checkpointing = True
            self.lm._set_gradient_checkpointing()
        elif self.model_config.llm.name == 'internlm2_5_7b':
            self.lm = AutoModelForCausalLM.from_pretrained(
                self.model_config.llm.pretrained_llm_path,
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
                local_files_only=True,
            )
        else:
            raise NotImplementedError(self.model_config.llm.name)

        self.freeze_llm = self.model_config.get("freeze_llm", True)
        logger.info(f'freeze_llm: {self.freeze_llm}')
        if self.freeze_llm:
            logger.info("freeze llm")
            freeze_module(self.lm)

        if self.model_config.llm.use_lora:
            self.use_lora = True
            from peft import get_peft_model, LoraConfig, TaskType
            logger.info("Use lora")
            if self.model_config.llm.name == 'internlm_20b':
                peft_config = LoraConfig(
                    task_type=TaskType.CAUSAL_LM, inference_mode=False,
                    r=self.model_config.llm.lora_r, lora_alpha=self.model_config.llm.lora_alpha,
                    lora_dropout=self.model_config.llm.lora_dropout,
                    target_modules=['wqkv', 'wo', 'w1', 'w2', 'w3', 'output']
                )
            else:
                peft_config = LoraConfig(
                    task_type=TaskType.CAUSAL_LM, inference_mode=False,
                    r=self.model_config.llm.lora_r, lora_alpha=self.model_config.llm.lora_alpha,
                    lora_dropout=self.model_config.llm.lora_dropout,
                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                    "gate_proj", "up_proj", "down_proj", "lm_head"]
                )

            self.lm = get_peft_model(self.lm, peft_config)
            self.lm.enable_input_require_grads()
            self.lm.print_trainable_parameters()
        else:
            self.use_lora = False
    def build_loss(self):
        self.use_vision_regression_loss = self.model_config.loss.get("use_vision_regression_loss", False)
        if self.use_vision_regression_loss:
            self.image_loss_fct = MSELoss()
    @property
    def dtype(self):
        return self.lm.dtype

    @property
    def device(self):
        return self.lm.device
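
# Usage sketch (illustrative, not part of the original file): BaseMLLM only wires up the
# vision encoder, the optional Q-Former bridge, and the LLM; subclasses are expected to add
# the forward/generation logic. The config source below is a placeholder assumption.
#
#   from model_config import VideoChat2Config
#
#   config = VideoChat2Config.from_pretrained("path/to/checkpoint")  # hypothetical path
#   model = BaseMLLM(config)
#   print(model.dtype, model.device)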