File size: 4,442 Bytes
5d125d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# Copyright 2024 AniMemory Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from safetensors.torch import load_file
from transformers import CLIPTextConfig, CLIPTextModelWithProjection
class AniMemoryAltCLip(torch.nn.Module):
def __init__(self, config: CLIPTextConfig):
super().__init__()
self.model_hf = CLIPTextModelWithProjection(config)
self.linear_proj = torch.nn.Linear(in_features=1280, out_features=1280)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path,
subfolder="",
linear_proj_name="weights.safetensors",
torch_dtype=torch.float16,
):
cls.dtype = torch_dtype
config = CLIPTextModelWithProjection.config_class.from_pretrained(
pretrained_model_name_or_path, subfolder=subfolder
)
model = cls(config=config)
model.model_hf = CLIPTextModelWithProjection.from_pretrained(
pretrained_model_name_or_path, subfolder=subfolder
)
linear_proj_state = load_file(
os.path.join(pretrained_model_name_or_path, subfolder, linear_proj_name)
)
model.linear_proj.load_state_dict(linear_proj_state)
return model
def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(
*args, **kwargs
)
super(AniMemoryAltCLip, self).to(*args, **kwargs)
self.dtype = dtype if dtype is not None else self.dtype
self.device = device if device is not None else self.device
return self
def expand_mask(self, mask=None, dtype="", tgt_len=None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = (
mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(
inverted_mask.to(torch.bool), torch.finfo(dtype).min
)
def make_attn_mask(self, attn_mask):
seq_len = attn_mask.shape[1]
query = attn_mask.unsqueeze(1).float()
attn_mask = (
query.repeat([1, seq_len, 1]).unsqueeze(1).repeat([1, self.num_head, 1, 1])
)
attn_mask = attn_mask.view([-1, seq_len, seq_len])
return attn_mask
def gradient_checkpointing_enable(
self,
):
self.model_hf.gradient_checkpointing_enable()
def forward(self, text, attention_mask):
hidden_states = self.model_hf.text_model.embeddings(
input_ids=text, position_ids=None
)
if attention_mask is None:
print("Warning: attention_mask is None in altclip!")
new_attn_mask = (
self.expand_mask(attention_mask, hidden_states.dtype)
if attention_mask is not None
else None
)
encoder_outputs = self.model_hf.text_model.encoder(
inputs_embeds=hidden_states,
attention_mask=new_attn_mask,
causal_attention_mask=None,
output_attentions=False,
output_hidden_states=True,
return_dict=True,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.model_hf.text_model.final_layer_norm(last_hidden_state)
last_hidden_state = (
last_hidden_state[torch.arange(last_hidden_state.shape[0]), 0]
@ self.model_hf.text_projection.weight
)
pooled_output = self.linear_proj(last_hidden_state)
extra_features = encoder_outputs.hidden_states[-2]
extra_features = self.model_hf.text_model.final_layer_norm(extra_features)
return extra_features, pooled_output
|