""" Copyright (c) 2023, salesforce.com, inc. All rights reserved. SPDX-License-Identifier: BSD-3-Clause For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause """ import logging import torch import torch.distributed as dist import torch.nn as nn from typing import Optional, Tuple, List from torch.cuda.amp import autocast as autocast from torch.nn import functional as F from lavis.common.registry import registry from lavis.models.base_model import all_gather_with_grad, concat_all_gather from lavis.models.blip2_models.blip2 import ( compute_sim_matrix, disabled_train, ) from lavis.models.blip_models.blip_outputs import BlipOutput from transformers.modeling_outputs import ModelOutput from models.q_formers.blip2 import Blip2Base from models.q_formers.position_encoding import PositionEmbeddings from ldm.modules.diffusionmodules.util import conv_nd import time class BlipOutputFeatures(ModelOutput): """ Data class of features from BlipFeatureExtractor. Args: image_embeds: (torch.FloatTensor) of shape (batch_size, num_patches+1, embed_dim), optional image_features: (torch.FloatTensor) of shape (batch_size, num_patches+1, feature_dim), optional text_embeds: (torch.FloatTensor) of shape (batch_size, sequence_length+1, embed_dim), optional text_features: (torch.FloatTensor) of shape (batch_size, sequence_length+1, feature_dim), optional The first embedding or feature is for the [CLS] token. Features are obtained by projecting the corresponding embedding into a normalized low-dimensional space. """ image_embeds: Optional[torch.FloatTensor] = None image_embeds_proj: Optional[torch.FloatTensor] = None text_embeds: Optional[torch.FloatTensor] = None text_embeds_proj: Optional[torch.FloatTensor] = None multimodal_embeds: Optional[torch.FloatTensor] = None hidden_states: List[torch.FloatTensor] = None attentions: List[torch.FloatTensor] = None cross_attentions: List[torch.FloatTensor] = None class Blip2Qformer(Blip2Base): """ BLIP2 first-stage model with Q-former and ViT. 
class Blip2Qformer(Blip2Base):
    """
    BLIP2 first-stage model with Q-former and ViT.

    Supported model types:
        - pretrain: pretrained model with vit-g
        - pretrain_vitL: pretrained model with vit-large
        - coco: finetuned model on coco

    Usage:
        >>> from lavis.models import load_model
        >>> model = load_model("blip2", "pretrain")
    """

    PRETRAINED_MODEL_CONFIG_DICT = {
        "pretrain": "configs/models/blip2/blip2_pretrain.yaml",
        "pretrain_vitL": "configs/models/blip2/blip2_pretrain_vitL.yaml",
        "coco": "configs/models/blip2/blip2_coco.yaml",
    }

    def __init__(
        self,
        model_name="bert-base-uncased",
        vit_model="eva_clip_g",
        img_size=224,
        drop_path_rate=0,
        head_dropout=0,
        use_grad_checkpoint=False,
        vit_precision="fp16",
        freeze_vit=True,
        num_query_token=32,
        cross_attention_freq=2,
        embed_dim=256,
        max_txt_len=32,
        query_token_init_type="normal",
        max_position_embeddings=512,
        multilevels=[],
    ):
        super().__init__()

        self.num_query_token = num_query_token
        self.tokenizer = self.init_tokenizer(model_name)

        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
            vit_model,
            img_size,
            drop_path_rate,
            use_grad_checkpoint,
            vit_precision,
            len(multilevels),
        )
        self.multilevels = multilevels
        self.crossattn_embeddings = PositionEmbeddings(
            max_position_embeddings, self.visual_encoder.num_features
        )
        self.Qformer, self.query_tokens = self.init_Qformer(
            num_query_token,
            self.visual_encoder.num_features,
            model_name,
            head_dropout,
            cross_attention_freq,
            query_token_init_type,
        )
        self.Qformer.resize_token_embeddings(len(self.tokenizer))
        state_dict = self.Qformer.state_dict()
        for name, param in self.Qformer.named_parameters():
            if "_query" in name:
                key_orig = name.replace("_query", "")
                param.data.copy_(state_dict[key_orig])

        self.vision_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim)
        self.text_proj = nn.Linear(self.Qformer.config.hidden_size, embed_dim)

        self.itm_head = nn.Linear(self.Qformer.config.hidden_size, 2)

        self.temp = nn.Parameter(0.07 * torch.ones([]))

        self.max_txt_len = max_txt_len

        self.visual_encoder.requires_grad_(False)
        for name, param in self.Qformer.named_parameters():
            if "crossattention" in name:
                param.requires_grad = True
            else:
                param.requires_grad = False

        # Only the Q-Former cross-attention layers stay trainable; the pretraining
        # heads below are removed, so methods that depend on them (`forward`,
        # `compute_itm`, and the unimodal branches of `extract_features`) are not
        # usable in this configuration.
        del self.Qformer.cls
        del self.vision_proj
        del self.text_proj
        del self.itm_head
        del self.temp

    def forward(self, samples):
        image = samples["image"]
        text = samples["text_input"]

        image_embeds = self.ln_vision(self.visual_encoder(image))
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)

        query_output = self.Qformer.bert(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            use_cache=True,
            return_dict=True,
        )

        image_feats = F.normalize(
            self.vision_proj(query_output.last_hidden_state), dim=-1
        )

        text_tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(image.device)
        text_output = self.Qformer.bert(
            text_tokens.input_ids,
            attention_mask=text_tokens.attention_mask,
            return_dict=True,
        )
        text_feat = F.normalize(
            self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1
        )
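
        # Shape walk-through for the contrastive similarities computed below
        # (illustrative; B = batch size, W = world size, Q = num_query_token, D = embed_dim):
        #   image_feats            (B, Q, D)   -> concat_all_gather -> (B*W, Q, D)
        #   text_feat              (B, D)      -> concat_all_gather -> (B*W, D)
        #   sim_q2t = image_feats.unsqueeze(1) @ text_feat_all.unsqueeze(-1)
        #                          (B, B*W, Q)  per-query image-to-text scores
        #   sim_i2t = sim_q2t.max(-1) -> (B, B*W)  best query per (image, text) pair
        # The text-to-image direction (sim_t2q / sim_t2i) is built symmetrically.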
        ###============== Image-text Contrastive ===================###
        image_feats_all = concat_all_gather(
            image_feats
        )  # [batch_size*num_gpu, num_query_tokens, embed_dim]
        text_feat_all = concat_all_gather(text_feat)  # [batch_size*num_gpu, embed_dim]

        sim_q2t = torch.matmul(
            image_feats.unsqueeze(1), text_feat_all.unsqueeze(-1)
        ).squeeze()
        # [batch_size, batch_size*num_gpu, num_query_tokens]

        # image-text similarity: aggregate across all query tokens
        sim_i2t, _ = sim_q2t.max(-1)
        sim_i2t = sim_i2t / self.temp

        # text-query similarity: [batch_size, batch_size*num_gpu, num_query_tokens]
        sim_t2q = torch.matmul(
            text_feat.unsqueeze(1).unsqueeze(1), image_feats_all.permute(0, 2, 1)
        ).squeeze()

        # text-image similarity: aggregate across all query tokens
        sim_t2i, _ = sim_t2q.max(-1)
        sim_t2i = sim_t2i / self.temp  # [batch_size, batch_size*num_gpu]

        rank = dist.get_rank()
        bs = image.size(0)
        targets = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to(
            image.device
        )

        if "image_id" in samples.keys():  # coco retrieval finetuning
            image_ids = samples["image_id"].view(-1, 1)
            image_ids_all = concat_all_gather(image_ids)
            pos_idx = torch.eq(image_ids, image_ids_all.t()).float()
            sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)
            sim_targets = 0.9 * sim_targets + 0.1 * torch.ones_like(sim_targets) / sim_targets.size(1)

            loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1) * sim_targets, dim=1).mean()
            loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1) * sim_targets, dim=1).mean()
            loss_itc = (loss_t2i + loss_i2t) / 2
        else:
            loss_itc = (
                F.cross_entropy(sim_i2t, targets, label_smoothing=0.1)
                + F.cross_entropy(sim_t2i, targets, label_smoothing=0.1)
            ) / 2

        ###============== Image-text Matching ===================###
        text_input_ids_world = concat_all_gather(text_tokens.input_ids)
        text_attention_mask_world = concat_all_gather(text_tokens.attention_mask)
        image_embeds_world = all_gather_with_grad(image_embeds)
        with torch.no_grad():
            if "image_id" in samples.keys():
                mask = torch.eq(image_ids, image_ids_all.t())
                sim_t2i.masked_fill_(mask, -10000)
                sim_i2t.masked_fill_(mask, -10000)
            else:
                sim_t2i[:, rank * bs : rank * bs + bs].fill_diagonal_(-10000)
                sim_i2t[:, rank * bs : rank * bs + bs].fill_diagonal_(-10000)

            weights_t2i = F.softmax(sim_t2i, dim=1)
            weights_i2t = F.softmax(sim_i2t, dim=1)

        # select a negative image for each text
        image_embeds_neg = []
        for b in range(bs):
            neg_idx = torch.multinomial(weights_t2i[b], 1).item()
            image_embeds_neg.append(image_embeds_world[neg_idx])
        image_embeds_neg = torch.stack(image_embeds_neg, dim=0)

        # select a negative text for each image
        text_ids_neg = []
        text_atts_neg = []
        for b in range(bs):
            neg_idx = torch.multinomial(weights_i2t[b], 1).item()
            text_ids_neg.append(text_input_ids_world[neg_idx])
            text_atts_neg.append(text_attention_mask_world[neg_idx])

        text_ids_neg = torch.stack(text_ids_neg, dim=0)
        text_atts_neg = torch.stack(text_atts_neg, dim=0)

        text_ids_all = torch.cat(
            [text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], dim=0
        )  # pos, pos, neg
        text_atts_all = torch.cat(
            [text_tokens.attention_mask, text_tokens.attention_mask, text_atts_neg],
            dim=0,
        )

        query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1)
        query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long).to(
            image.device
        )
        attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1)

        image_embeds_all = torch.cat(
            [image_embeds, image_embeds_neg, image_embeds], dim=0
        )  # pos, neg, pos
        image_atts_all = torch.ones(image_embeds_all.size()[:-1], dtype=torch.long).to(
            image.device
        )

        output_itm = self.Qformer.bert(
            text_ids_all,
            query_embeds=query_tokens_itm,
            attention_mask=attention_mask_all,
            encoder_hidden_states=image_embeds_all,
            encoder_attention_mask=image_atts_all,
            return_dict=True,
        )

        vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :]
        vl_output = self.itm_head(vl_embeddings)
        logits = vl_output.mean(dim=1)

        itm_labels = torch.cat(
            [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)],
            dim=0,
        ).to(image.device)
        loss_itm = F.cross_entropy(logits, itm_labels)
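
        # The captioning loss below reuses the query pass from the top of `forward`:
        # `use_cache=True` stored the queries' keys/values in
        # `query_output.past_key_values`, so the language-modeling pass conditions the
        # text tokens on the image-grounded queries without re-running them. The first
        # text token is replaced by the tokenizer's BOS id as the decoder start token,
        # and padding positions are excluded from the loss via label -100.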
        ##================= Image Captioning ========================##
        decoder_input_ids = text_tokens.input_ids.clone()
        decoder_input_ids[:, 0] = self.tokenizer.bos_token_id
        labels = decoder_input_ids.masked_fill(
            decoder_input_ids == self.tokenizer.pad_token_id, -100
        )

        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
            image.device
        )
        attention_mask = torch.cat([query_atts, text_tokens.attention_mask], dim=1)
        lm_output = self.Qformer(
            decoder_input_ids,
            attention_mask=attention_mask,
            past_key_values=query_output.past_key_values,
            return_dict=True,
            labels=labels,
        )

        loss_lm = lm_output.loss

        return BlipOutput(
            loss=loss_itc + loss_itm + loss_lm,
            loss_itc=loss_itc,
            loss_itm=loss_itm,
            loss_lm=loss_lm,
        )

    @torch.no_grad()
    def generate(
        self,
        samples,
        use_nucleus_sampling=False,
        num_beams=3,
        max_length=30,
        min_length=10,
        top_p=0.9,
        repetition_penalty=1.0,
    ):
        """
        Args:
            samples (dict): A dictionary containing the following keys:
                - image (torch.Tensor): A tensor of shape (batch_size, 3, H, W)
            use_nucleus_sampling (bool): Whether to use nucleus sampling. If False, use beam search.
            num_beams (int): Number of beams for beam search. 1 means no beam search.
            max_length (int): The maximum length of the sequence to be generated.
            min_length (int): The minimum length of the sequence to be generated.
            top_p (float): The cumulative probability for nucleus sampling.
            repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
        Returns:
            captions (list): A list of strings of length batch_size, one caption per image.
        """
        image = samples["image"]
        image_embeds = self.ln_vision(self.visual_encoder(image))

        if not use_nucleus_sampling:
            image_embeds = image_embeds.repeat_interleave(num_beams, dim=0)
        else:
            num_beams = 1
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )

        model_kwargs = {
            "encoder_hidden_states": image_embeds,
            "encoder_attention_mask": image_atts,
        }

        input_ids = (
            torch.LongTensor(image.size(0), 1)
            .fill_(self.tokenizer.bos_token_id)
            .to(image.device)
        )
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)

        outputs = self.Qformer.generate(
            input_ids=input_ids,
            query_embeds=query_tokens,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            do_sample=use_nucleus_sampling,
            top_p=top_p,
            eos_token_id=self.tokenizer.sep_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            **model_kwargs
        )
        captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return captions
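
    # Illustrative call pattern for `generate` (sketch only; `model` and `images`
    # are assumptions: a constructed model and a preprocessed (B, 3, H, W) batch
    # matching the docstring above):
    #
    #     captions = model.generate(
    #         {"image": images}, use_nucleus_sampling=False, num_beams=5, max_length=30
    #     )
    #
    # With beam search the image embeddings are repeated `num_beams` times so each
    # beam attends to its own copy of the frozen visual features.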
    def forward_visual_encoder(self, image):
        with torch.no_grad():
            with self.maybe_autocast():
                image_embeds_frozen = self.visual_encoder(image, output_hidden_states=True)
                image_embeds_frozen = [
                    ln(image_embeds_frozen[lvl])
                    for lvl, ln in zip(self.multilevels, self.ln_vision)
                ]
                image_embeds_frozen = [image_embed.float() for image_embed in image_embeds_frozen]
                image_atts = [
                    torch.ones(image_embed.size()[:-1], dtype=torch.long).to(self.device)
                    for image_embed in image_embeds_frozen
                ]
        return image_embeds_frozen, image_atts

    # NOTE: the second `forward_qformer` definition below replaces this one at class
    # creation time; it is the variant that matches the list of multi-level features
    # returned by `forward_visual_encoder`.
    def forward_qformer(self, caption, image_embeds_frozen, image_atts, output_hidden_states=False):
        query_tokens = self.query_tokens.expand(image_embeds_frozen.shape[0], -1, -1)
        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
            self.device
        )

        text = self.tokenizer(caption, return_tensors="pt", padding=True, truncation=True).to(
            self.device
        )
        attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)
        query_pos_embeds = self.query_tokens.repeat(image_embeds_frozen.shape[0], 1, 1)

        output = self.Qformer.bert(
            text.input_ids,
            query_embeds=query_tokens,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds_frozen,
            encoder_attention_mask=image_atts,
            query_pos_embeds=query_pos_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        hidden_states = [feat[:, : query_tokens.size(1), :] for feat in output.hidden_states]
        return hidden_states

    def forward_qformer(self, caption, image_embeds_frozen, image_atts):
        bs = image_embeds_frozen[0].shape[0]
        query_tokens = self.query_tokens.expand(bs, -1, -1)
        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(self.device)

        text = self.tokenizer(
            [""] * len(caption),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(self.device)
        attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)
        query_pos_embeds = self.query_tokens.repeat(bs, 1, 1)

        output = self.Qformer.bert(
            text.input_ids,
            query_embeds=query_tokens,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds_frozen,
            encoder_attention_mask=image_atts,
            query_pos_embeds=query_pos_embeds,
            output_hidden_states=True,
            return_dict=True,
        )
        hidden_states = [feat[:, : query_tokens.size(1), :] for feat in output.hidden_states]
        return hidden_states

    def forward_image(self, image):
        image_embeds = self.ln_vision(self.visual_encoder(image))
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            image.device
        )

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)

        query_output = self.Qformer.bert(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        return query_output.last_hidden_state, image_embeds

    def forward_text(self, text_tokens):
        text_output = self.Qformer.bert(
            text_tokens.input_ids,
            attention_mask=text_tokens.attention_mask,
            return_dict=True,
        )
        return text_output.last_hidden_state[:, 0, :]

    def compute_itm(self, image_inputs, text_ids, text_atts):
        image_atts = torch.ones(image_inputs.size()[:-1], dtype=torch.long).to(
            image_inputs.device
        )
        query_tokens = self.query_tokens.expand(image_inputs.shape[0], -1, -1)
        query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
            image_inputs.device
        )
        attention_mask = torch.cat([query_atts, text_atts], dim=1)
        output_itm = self.Qformer.bert(
            text_ids,
            query_embeds=query_tokens,
            attention_mask=attention_mask,
            encoder_hidden_states=image_inputs,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        vl_embeddings = output_itm.last_hidden_state[:, : query_tokens.size(1), :]
        itm_logit = self.itm_head(vl_embeddings)
        itm_logit = itm_logit[:, :, 1].mean(dim=1)
        return itm_logit
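
    # `extract_features` below supports three modes (illustrative summary; H is the
    # Q-Former hidden size, Q = num_query_token, 256 is the default `embed_dim`):
    #   mode="image":      needs samples["image"]      -> image_embeds (B, Q, H), image_embeds_proj (B, Q, 256)
    #   mode="text":       needs samples["text_input"] -> text_embeds (B, L, H), text_embeds_proj (B, L, 256)
    #   mode="multimodal": needs both                  -> multimodal_embeds (B, Q, H)
    # Note that the unimodal modes rely on `vision_proj` / `text_proj`, which
    # `__init__` above deletes, so only the multimodal path is usable in this
    # configuration.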
    @torch.no_grad()
    def extract_features(self, samples, mode="multimodal"):
        """
        Extract features for multimodal or unimodal samples.

        Args:
            samples (dict): A dictionary of samples, containing the following keys:
                - image (torch.Tensor): A tensor of shape (B, C, H, W) containing the image.
                    Raw images should be preprocessed before being passed to feature extractor.
                - text_input (list): A list of strings containing the text, length B.
            mode (str): The mode of feature extraction. Can be either "multimodal", "text" or "image".
                If "multimodal", return image features and multimodal features;
                if "text", return text features;
                if "image", return image features.
                Default: "multimodal".
        Returns:
            BlipOutputFeatures: A BlipOutputFeatures object containing the features.
                See lavis/models/blip_models/blip_outputs.py for more details.
        """
        image = samples.get("image")
        caption = samples.get("text_input")

        # assert mode is one of "image", "text", "multimodal"
        assert mode in [
            "image",
            "text",
            "multimodal",
        ], "mode must be one of 'image', 'text', 'multimodal'"

        # initialize output
        image_embeds, text_embeds, multimodal_embeds = None, None, None
        image_features, text_features = None, None

        if mode == "image":
            assert (
                image is not None
            ), "Image is not provided for mode 'image' or 'multimodal'"
            # return query features
            with self.maybe_autocast():
                image_embeds_frozen = self.ln_vision(self.visual_encoder(image))
            image_embeds_frozen = image_embeds_frozen.float()
            image_atts = torch.ones(
                image_embeds_frozen.size()[:-1], dtype=torch.long
            ).to(self.device)
            query_tokens = self.query_tokens.expand(
                image_embeds_frozen.shape[0], -1, -1
            )

            query_output = self.Qformer.bert(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds_frozen,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            image_embeds = query_output.last_hidden_state
            image_features = F.normalize(self.vision_proj(image_embeds), dim=-1)

        elif mode == "text":
            assert (
                caption is not None
            ), "text input is None for mode 'text' or 'multimodal'"

            # return text features
            text = self.tokenizer(caption, return_tensors="pt", padding=True).to(
                self.device
            )

            text_output = self.Qformer.bert(
                text.input_ids,
                attention_mask=text.attention_mask,
                return_dict=True,
            )
            text_embeds = text_output.last_hidden_state
            text_features = self.text_proj(text_embeds)
            text_features = F.normalize(text_features, dim=-1)

        elif mode == "multimodal":
            # return multimodal query features
            with self.maybe_autocast():
                image_embeds_frozen = self.ln_vision(self.visual_encoder(image))
            image_embeds_frozen = image_embeds_frozen.float()
            image_atts = torch.ones(
                image_embeds_frozen.size()[:-1], dtype=torch.long
            ).to(self.device)
            query_tokens = self.query_tokens.expand(
                image_embeds_frozen.shape[0], -1, -1
            )
            query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(
                self.device
            )

            text = self.tokenizer(caption, return_tensors="pt", padding=True).to(
                self.device
            )
            attention_mask = torch.cat([query_atts, text.attention_mask], dim=1)

            output = self.Qformer.bert(
                text.input_ids,
                query_embeds=query_tokens,
                attention_mask=attention_mask,
                encoder_hidden_states=image_embeds_frozen,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )

            multimodal_embeds = output.last_hidden_state[:, : query_tokens.size(1), :]

        return BlipOutputFeatures(
            image_embeds=image_embeds,
            image_embeds_proj=image_features,
            text_embeds=text_embeds,
            text_embeds_proj=text_features,
            multimodal_embeds=multimodal_embeds,
        )

    @classmethod
    def from_config(cls, cfg):
        vit_model = cfg.get("vit_model", "eva_clip_g")
        img_size = cfg.get("image_size")
        num_query_token = cfg.get("num_query_token")
        cross_attention_freq = cfg.get("cross_attention_freq", 2)

        drop_path_rate = cfg.get("drop_path_rate", 0)
        use_grad_checkpoint = cfg.get("use_grad_checkpoint", False)
        vit_precision = cfg.get("vit_precision", "fp16")
        freeze_vit = cfg.get("freeze_vit", True)

        max_txt_len = cfg.get("max_txt_len", 32)

        model = cls(
            vit_model=vit_model,
            img_size=img_size,
            drop_path_rate=drop_path_rate,
            use_grad_checkpoint=use_grad_checkpoint,
            vit_precision=vit_precision,
            freeze_vit=freeze_vit,
            num_query_token=num_query_token,
            cross_attention_freq=cross_attention_freq,
            max_txt_len=max_txt_len,
        )

        model.load_checkpoint_from_config(cfg)

        return model
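
    # Minimal config sketch for `from_config` (keys mirror the `cfg.get(...)` calls
    # above; the values shown are the constructor defaults and are illustrative):
    #
    #     vit_model: eva_clip_g
    #     image_size: 224
    #     num_query_token: 32
    #     cross_attention_freq: 2
    #     drop_path_rate: 0
    #     use_grad_checkpoint: False
    #     vit_precision: fp16
    #     freeze_vit: True
    #     max_txt_len: 32
    #
    # `load_checkpoint_from_config` additionally expects whatever checkpoint entry
    # the base class defines; that key is not specified here.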
    def compute_sim_matrix(self, data_loader, task_cfg):
        """
        Compute similarity i2t, t2i matrix for the given data loader.
        """
        k_test = task_cfg.k_test

        return compute_sim_matrix(model=self, data_loader=data_loader, k_test=k_test)