Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

text-encoder/config.json +31 -0
text-encoder/model.safetensors +3 -0
text-encoder/utils.py +332 -0
utils.py +332 -0
vision-encoder/config.json +16 -0
vision-encoder/model.safetensors +3 -0
vision-encoder/utils.py +332 -0

text-encoder/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "CLIPTextEncoderOnly"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "utils.CLIPTextEncoderOnlyConfig",
+    "AutoModel": "utils.CLIPTextEncoderOnly"
+  },
+  "bos_token_id": 49406,
+  "eos_token_id": 49407,
+  "frozen": false,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 512,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-05,
+  "lora": null,
+  "max_position_embeddings": 77,
+  "model_name": "openai/clip-vit-base-patch32",
+  "model_type": "clip_custom_text_model",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "pretrained": false,
+  "projection_dim": 512,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
+  "vocab_size": 49408
+}

text-encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:560e169fa2b2aae50f7b22ddb7aeccea7035e2d0230af5a897db364dbd8fa7f3
+size 253736912

text-encoder/utils.py ADDED Viewed

	@@ -0,0 +1,332 @@

+from transformers import AutoConfig, AutoModel, PretrainedConfig, CLIPTextConfig, CLIPVisionConfig, PreTrainedModel, CLIPTextModelWithProjection, CLIPVisionModelWithProjection
+from transformers.utils import ModelOutput
+import torch
+import open_clip
+from dataclasses import dataclass
+import safetensors.torch
+from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
+import os
+HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"
+HF_SAFE_WEIGHTS_NAME_PRIOR = "prior_model.safetensors"
+@dataclass
+class PriorTransformerOutput(ModelOutput):
+    """
+    The output of [`PriorTransformer`].
+    Args:
+        predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
+            The predicted CLIP image embedding conditioned on the CLIP text embedding input.
+    """
+    predicted_image_embedding: torch.FloatTensor
+@dataclass
+class TextEncoderOutput(ModelOutput):
+    """
+    Output class for CLIPTextEncoderOnly model to store the outputs in a Hugging Face transformer style.
+    Attributes:
+        prompt_embeds (torch.Tensor): The embeddings of the input prompts.
+        last_hidden_states (torch.Tensor): The last hidden states from the model.
+    """
+    text_embeds: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+class CLIPTextEncoderOnlyConfig(CLIPTextConfig):
+    model_type = "clip_custom_text_model"
+    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.frozen = frozen
+        self.lora = lora
+        super().__init__(**kwargs)
+class CLIPTextEncoderOnly(PreTrainedModel):
+    config_class = CLIPTextEncoderOnlyConfig
+    def __init__(self, config):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__(config)
+        if config.pretrained:
+            self.model = CLIPTextModelWithProjection.from_pretrained(config.model_name)
+        else:
+            base_cfg = CLIPTextConfig.from_pretrained(config.model_name)
+            self.model = CLIPTextModelWithProjection(base_cfg)
+        if config.lora:
+            l_config = LoraConfig(
+                r=config.lora.lora_r,
+                lora_alpha=config.lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=config.lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, input_ids, attention_mask=None, position_ids=None):
+        """
+        Forward pass of the model.
+        :param input_ids: Indices of input sequence tokens in the vocabulary.
+        :param attention_mask: Mask to avoid performing attention on padding token indices.
+        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        :return: Outputs of the model.
+        """
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_hidden_states=True)
+        return TextEncoderOutput(text_embeds=outputs.text_embeds, last_hidden_state=outputs.last_hidden_state)
+class CustomTextEncoderOnly(PreTrainedModel):
+    def __init__(self, model_name: str, output_hidden_size: int, pretrained: bool = True, frozen: bool = True, last_hidden_state: bool = False, lora: dict = None):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        config = AutoModel.from_pretrained(model_name).config
+        super().__init__(config)
+        self.last_hidden_state = last_hidden_state
+        if pretrained:
+            self.model = AutoModel.from_pretrained(model_name)
+            if frozen:
+                for param in self.model.parameters():
+                    param.requires_grad = False
+        else:
+            self.model = AutoModel(config)
+        self.fc1 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
+        if last_hidden_state:
+            self.fc2 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
+        if lora:
+            l_config = LoraConfig(
+                task_type=TaskType.FEATURE_EXTRACTION,
+                r=lora.lora_r,
+                lora_alpha=lora.lora_alpha,
+                lora_dropout=lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
+        """
+        Forward pass of the model.
+        :param input_ids: Indices of input sequence tokens in the vocabulary.
+        :param attention_mask: Mask to avoid performing attention on padding token indices.
+        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        :return: Outputs of the model.
+        """
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
+        text_embeds = self.fc1(outputs[1])
+        last_hidden_state = None
+        if self.last_hidden_state:
+            last_hidden_state = self.fc2(outputs[0])
+        else:
+            last_hidden_state = outputs[0]
+        return TextEncoderOutput(text_embeds=text_embeds, last_hidden_state=last_hidden_state)
+class CLIPVisionEncoderOnlyConfig(PretrainedConfig):
+    model_type = "clip_custom_vision_model"
+    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.frozen = frozen
+        self.lora = lora
+        super().__init__(**kwargs)
+class CLIPVisionEncoderOnly(PreTrainedModel):
+    config_class = CLIPVisionEncoderOnlyConfig
+    def __init__(self, config):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__(config)
+        if config.pretrained:
+            self.model = CLIPVisionModelWithProjection.from_pretrained(config.model_name)
+        else:
+            base_cfg = CLIPVisionConfig.from_pretrained(config.model_name)
+            self.model = CLIPVisionModelWithProjection(base_cfg)
+        if config.lora:
+            l_config = LoraConfig(
+                r=config.lora.lora_r,
+                lora_alpha=config.lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=config.lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, data):
+        """
+        Forward pass of the model.
+        """
+        return self.model(**data).image_embeds
+    def parameters(self):
+        return self.model.parameters()
+class OpenCLIPVisionEncoderOnly(torch.nn.Module):
+    def __init__(self, model_name: str, pretrained: bool = True, frozen: bool = False, lora: dict = None):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__()
+        if pretrained:
+            model, _ = open_clip.create_model_from_pretrained(f"hf-hub:{model_name}")
+            model = model.visual
+        else:
+            raise NotImplemented
+        self.model = model
+        if lora:
+            l_config = LoraConfig(
+                r=lora.lora_r,
+                lora_alpha=lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, image):
+        """
+        Forward pass of the model.
+        """
+        return self.model(image)
+    def save_pretrained(self, save_dir):
+        tensors = self.model.state_dict()
+        safetensors.torch.save_file(tensors, save_dir / HF_SAFE_WEIGHTS_NAME)
+class CustomPriorModel(torch.nn.Module):
+    def __init__(self, in_hidden_state, out_hidden_state):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__()
+        mid_hidden_state = max(in_hidden_state, out_hidden_state)
+        self.fc1 = torch.nn.Linear(in_hidden_state*2, mid_hidden_state)
+        self.relu = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(mid_hidden_state, out_hidden_state)
+    def reinitialize_model(self):
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                if len(param.shape) > 1:
+                    torch.nn.init.xavier_uniform_(param)
+                else:
+                    if 'weight' in name:
+                        torch.nn.init.normal_(param)
+                    else:
+                        torch.nn.init.zeros_(param)
+    def forward(self, feats):
+        """
+        Forward pass of the model.
+        """
+        return PriorTransformerOutput(predicted_image_embedding=self.fc2(self.relu(self.fc1(feats))))
+    def save_pretrained(self, save_dir):
+        pass
+        # tensors = self.state_dict()
+        # safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME_PRIOR))
+def test_text_model(register=False, upload=False):
+    # register the classes
+    if register:
+        AutoConfig.register("clip_custom_text_model", CLIPTextEncoderOnlyConfig)
+        AutoModel.register(CLIPTextEncoderOnlyConfig, CLIPTextEncoderOnly)
+        CLIPTextEncoderOnlyConfig.register_for_auto_class()
+        CLIPTextEncoderOnly.register_for_auto_class("AutoModel")
+    if upload:
+        # Initialize the model
+        model_name = "openai/clip-vit-base-patch32"
+        pretrained=True
+        lora=None
+        cfg = CLIPTextEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
+        model = CLIPTextEncoderOnly(cfg)
+        model.push_to_hub("test-text-hf-upload")
+        model = CLIPTextEncoderOnly.from_pretrained("mpatel57/test-text-hf-upload", force_download=True)
+def test_vision_model(register=False, upload=False):
+    # register the classes
+    if register:
+        AutoConfig.register("clip_custom_vision_model", CLIPVisionEncoderOnlyConfig)
+        AutoModel.register(CLIPVisionEncoderOnlyConfig, CLIPVisionEncoderOnly)
+        CLIPVisionEncoderOnlyConfig.register_for_auto_class()
+        CLIPVisionEncoderOnly.register_for_auto_class("AutoModel")
+    if upload:
+        # Initialize the model
+        model_name = "openai/clip-vit-base-patch32"
+        pretrained=True
+        lora=None
+        cfg = CLIPVisionEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
+        model = CLIPVisionEncoderOnly(cfg)
+        model.push_to_hub("test-vision-hf-upload")
+        model = CLIPVisionEncoderOnly.from_pretrained("mpatel57/test-vision-hf-upload", force_download=True)
+if __name__ == "__main__":
+    test_text_model(register=False, upload=True)
+    test_vision_model(register=False, upload=True)

utils.py ADDED Viewed

	@@ -0,0 +1,332 @@

+from transformers import AutoConfig, AutoModel, PretrainedConfig, CLIPTextConfig, CLIPVisionConfig, PreTrainedModel, CLIPTextModelWithProjection, CLIPVisionModelWithProjection
+from transformers.utils import ModelOutput
+import torch
+import open_clip
+from dataclasses import dataclass
+import safetensors.torch
+from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
+import os
+HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"
+HF_SAFE_WEIGHTS_NAME_PRIOR = "prior_model.safetensors"
+@dataclass
+class PriorTransformerOutput(ModelOutput):
+    """
+    The output of [`PriorTransformer`].
+    Args:
+        predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
+            The predicted CLIP image embedding conditioned on the CLIP text embedding input.
+    """
+    predicted_image_embedding: torch.FloatTensor
+@dataclass
+class TextEncoderOutput(ModelOutput):
+    """
+    Output class for CLIPTextEncoderOnly model to store the outputs in a Hugging Face transformer style.
+    Attributes:
+        prompt_embeds (torch.Tensor): The embeddings of the input prompts.
+        last_hidden_states (torch.Tensor): The last hidden states from the model.
+    """
+    text_embeds: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+class CLIPTextEncoderOnlyConfig(CLIPTextConfig):
+    model_type = "clip_custom_text_model"
+    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.frozen = frozen
+        self.lora = lora
+        super().__init__(**kwargs)
+class CLIPTextEncoderOnly(PreTrainedModel):
+    config_class = CLIPTextEncoderOnlyConfig
+    def __init__(self, config):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__(config)
+        if config.pretrained:
+            self.model = CLIPTextModelWithProjection.from_pretrained(config.model_name)
+        else:
+            base_cfg = CLIPTextConfig.from_pretrained(config.model_name)
+            self.model = CLIPTextModelWithProjection(base_cfg)
+        if config.lora:
+            l_config = LoraConfig(
+                r=config.lora.lora_r,
+                lora_alpha=config.lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=config.lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, input_ids, attention_mask=None, position_ids=None):
+        """
+        Forward pass of the model.
+        :param input_ids: Indices of input sequence tokens in the vocabulary.
+        :param attention_mask: Mask to avoid performing attention on padding token indices.
+        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        :return: Outputs of the model.
+        """
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_hidden_states=True)
+        return TextEncoderOutput(text_embeds=outputs.text_embeds, last_hidden_state=outputs.last_hidden_state)
+class CustomTextEncoderOnly(PreTrainedModel):
+    def __init__(self, model_name: str, output_hidden_size: int, pretrained: bool = True, frozen: bool = True, last_hidden_state: bool = False, lora: dict = None):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        config = AutoModel.from_pretrained(model_name).config
+        super().__init__(config)
+        self.last_hidden_state = last_hidden_state
+        if pretrained:
+            self.model = AutoModel.from_pretrained(model_name)
+            if frozen:
+                for param in self.model.parameters():
+                    param.requires_grad = False
+        else:
+            self.model = AutoModel(config)
+        self.fc1 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
+        if last_hidden_state:
+            self.fc2 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
+        if lora:
+            l_config = LoraConfig(
+                task_type=TaskType.FEATURE_EXTRACTION,
+                r=lora.lora_r,
+                lora_alpha=lora.lora_alpha,
+                lora_dropout=lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
+        """
+        Forward pass of the model.
+        :param input_ids: Indices of input sequence tokens in the vocabulary.
+        :param attention_mask: Mask to avoid performing attention on padding token indices.
+        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        :return: Outputs of the model.
+        """
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
+        text_embeds = self.fc1(outputs[1])
+        last_hidden_state = None
+        if self.last_hidden_state:
+            last_hidden_state = self.fc2(outputs[0])
+        else:
+            last_hidden_state = outputs[0]
+        return TextEncoderOutput(text_embeds=text_embeds, last_hidden_state=last_hidden_state)
+class CLIPVisionEncoderOnlyConfig(PretrainedConfig):
+    model_type = "clip_custom_vision_model"
+    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.frozen = frozen
+        self.lora = lora
+        super().__init__(**kwargs)
+class CLIPVisionEncoderOnly(PreTrainedModel):
+    config_class = CLIPVisionEncoderOnlyConfig
+    def __init__(self, config):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__(config)
+        if config.pretrained:
+            self.model = CLIPVisionModelWithProjection.from_pretrained(config.model_name)
+        else:
+            base_cfg = CLIPVisionConfig.from_pretrained(config.model_name)
+            self.model = CLIPVisionModelWithProjection(base_cfg)
+        if config.lora:
+            l_config = LoraConfig(
+                r=config.lora.lora_r,
+                lora_alpha=config.lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=config.lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, data):
+        """
+        Forward pass of the model.
+        """
+        return self.model(**data).image_embeds
+    def parameters(self):
+        return self.model.parameters()
+class OpenCLIPVisionEncoderOnly(torch.nn.Module):
+    def __init__(self, model_name: str, pretrained: bool = True, frozen: bool = False, lora: dict = None):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__()
+        if pretrained:
+            model, _ = open_clip.create_model_from_pretrained(f"hf-hub:{model_name}")
+            model = model.visual
+        else:
+            raise NotImplemented
+        self.model = model
+        if lora:
+            l_config = LoraConfig(
+                r=lora.lora_r,
+                lora_alpha=lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, image):
+        """
+        Forward pass of the model.
+        """
+        return self.model(image)
+    def save_pretrained(self, save_dir):
+        tensors = self.model.state_dict()
+        safetensors.torch.save_file(tensors, save_dir / HF_SAFE_WEIGHTS_NAME)
+class CustomPriorModel(torch.nn.Module):
+    def __init__(self, in_hidden_state, out_hidden_state):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__()
+        mid_hidden_state = max(in_hidden_state, out_hidden_state)
+        self.fc1 = torch.nn.Linear(in_hidden_state*2, mid_hidden_state)
+        self.relu = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(mid_hidden_state, out_hidden_state)
+    def reinitialize_model(self):
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                if len(param.shape) > 1:
+                    torch.nn.init.xavier_uniform_(param)
+                else:
+                    if 'weight' in name:
+                        torch.nn.init.normal_(param)
+                    else:
+                        torch.nn.init.zeros_(param)
+    def forward(self, feats):
+        """
+        Forward pass of the model.
+        """
+        return PriorTransformerOutput(predicted_image_embedding=self.fc2(self.relu(self.fc1(feats))))
+    def save_pretrained(self, save_dir):
+        pass
+        # tensors = self.state_dict()
+        # safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME_PRIOR))
+def test_text_model(register=False, upload=False):
+    # register the classes
+    if register:
+        AutoConfig.register("clip_custom_text_model", CLIPTextEncoderOnlyConfig)
+        AutoModel.register(CLIPTextEncoderOnlyConfig, CLIPTextEncoderOnly)
+        CLIPTextEncoderOnlyConfig.register_for_auto_class()
+        CLIPTextEncoderOnly.register_for_auto_class("AutoModel")
+    if upload:
+        # Initialize the model
+        model_name = "openai/clip-vit-base-patch32"
+        pretrained=True
+        lora=None
+        cfg = CLIPTextEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
+        model = CLIPTextEncoderOnly(cfg)
+        model.push_to_hub("test-text-hf-upload")
+        model = CLIPTextEncoderOnly.from_pretrained("mpatel57/test-text-hf-upload", force_download=True)
+def test_vision_model(register=False, upload=False):
+    # register the classes
+    if register:
+        AutoConfig.register("clip_custom_vision_model", CLIPVisionEncoderOnlyConfig)
+        AutoModel.register(CLIPVisionEncoderOnlyConfig, CLIPVisionEncoderOnly)
+        CLIPVisionEncoderOnlyConfig.register_for_auto_class()
+        CLIPVisionEncoderOnly.register_for_auto_class("AutoModel")
+    if upload:
+        # Initialize the model
+        model_name = "openai/clip-vit-base-patch32"
+        pretrained=True
+        lora=None
+        cfg = CLIPVisionEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
+        model = CLIPVisionEncoderOnly(cfg)
+        model.push_to_hub("test-vision-hf-upload")
+        model = CLIPVisionEncoderOnly.from_pretrained("mpatel57/test-vision-hf-upload", force_download=True)
+if __name__ == "__main__":
+    test_text_model(register=False, upload=True)
+    test_vision_model(register=False, upload=True)

vision-encoder/config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "architectures": [
+    "CLIPVisionEncoderOnly"
+  ],
+  "auto_map": {
+    "AutoConfig": "utils.CLIPVisionEncoderOnlyConfig",
+    "AutoModel": "utils.CLIPVisionEncoderOnly"
+  },
+  "frozen": false,
+  "lora": null,
+  "model_name": "openai/clip-vit-base-patch32",
+  "model_type": "clip_custom_vision_model",
+  "pretrained": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.1"
+}

vision-encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29cb615e72ca4eebeda4d2ec6ca87e9f39e85dca939260bf6e04e06542d3103c
+size 351421984

vision-encoder/utils.py ADDED Viewed

	@@ -0,0 +1,332 @@

+from transformers import AutoConfig, AutoModel, PretrainedConfig, CLIPTextConfig, CLIPVisionConfig, PreTrainedModel, CLIPTextModelWithProjection, CLIPVisionModelWithProjection
+from transformers.utils import ModelOutput
+import torch
+import open_clip
+from dataclasses import dataclass
+import safetensors.torch
+from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
+import os
+HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors"
+HF_SAFE_WEIGHTS_NAME_PRIOR = "prior_model.safetensors"
+@dataclass
+class PriorTransformerOutput(ModelOutput):
+    """
+    The output of [`PriorTransformer`].
+    Args:
+        predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
+            The predicted CLIP image embedding conditioned on the CLIP text embedding input.
+    """
+    predicted_image_embedding: torch.FloatTensor
+@dataclass
+class TextEncoderOutput(ModelOutput):
+    """
+    Output class for CLIPTextEncoderOnly model to store the outputs in a Hugging Face transformer style.
+    Attributes:
+        prompt_embeds (torch.Tensor): The embeddings of the input prompts.
+        last_hidden_states (torch.Tensor): The last hidden states from the model.
+    """
+    text_embeds: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+class CLIPTextEncoderOnlyConfig(CLIPTextConfig):
+    model_type = "clip_custom_text_model"
+    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.frozen = frozen
+        self.lora = lora
+        super().__init__(**kwargs)
+class CLIPTextEncoderOnly(PreTrainedModel):
+    config_class = CLIPTextEncoderOnlyConfig
+    def __init__(self, config):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__(config)
+        if config.pretrained:
+            self.model = CLIPTextModelWithProjection.from_pretrained(config.model_name)
+        else:
+            base_cfg = CLIPTextConfig.from_pretrained(config.model_name)
+            self.model = CLIPTextModelWithProjection(base_cfg)
+        if config.lora:
+            l_config = LoraConfig(
+                r=config.lora.lora_r,
+                lora_alpha=config.lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=config.lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, input_ids, attention_mask=None, position_ids=None):
+        """
+        Forward pass of the model.
+        :param input_ids: Indices of input sequence tokens in the vocabulary.
+        :param attention_mask: Mask to avoid performing attention on padding token indices.
+        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        :return: Outputs of the model.
+        """
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_hidden_states=True)
+        return TextEncoderOutput(text_embeds=outputs.text_embeds, last_hidden_state=outputs.last_hidden_state)
+class CustomTextEncoderOnly(PreTrainedModel):
+    def __init__(self, model_name: str, output_hidden_size: int, pretrained: bool = True, frozen: bool = True, last_hidden_state: bool = False, lora: dict = None):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        config = AutoModel.from_pretrained(model_name).config
+        super().__init__(config)
+        self.last_hidden_state = last_hidden_state
+        if pretrained:
+            self.model = AutoModel.from_pretrained(model_name)
+            if frozen:
+                for param in self.model.parameters():
+                    param.requires_grad = False
+        else:
+            self.model = AutoModel(config)
+        self.fc1 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
+        if last_hidden_state:
+            self.fc2 = torch.nn.Linear(self.model.config.hidden_size, output_hidden_size)
+        if lora:
+            l_config = LoraConfig(
+                task_type=TaskType.FEATURE_EXTRACTION,
+                r=lora.lora_r,
+                lora_alpha=lora.lora_alpha,
+                lora_dropout=lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
+        """
+        Forward pass of the model.
+        :param input_ids: Indices of input sequence tokens in the vocabulary.
+        :param attention_mask: Mask to avoid performing attention on padding token indices.
+        :param token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        :return: Outputs of the model.
+        """
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
+        text_embeds = self.fc1(outputs[1])
+        last_hidden_state = None
+        if self.last_hidden_state:
+            last_hidden_state = self.fc2(outputs[0])
+        else:
+            last_hidden_state = outputs[0]
+        return TextEncoderOutput(text_embeds=text_embeds, last_hidden_state=last_hidden_state)
+class CLIPVisionEncoderOnlyConfig(PretrainedConfig):
+    model_type = "clip_custom_vision_model"
+    def __init__(self, model_name: str = None, pretrained: bool = True, frozen: bool = False, lora: dict = None, **kwargs):
+        self.model_name = model_name
+        self.pretrained = pretrained
+        self.frozen = frozen
+        self.lora = lora
+        super().__init__(**kwargs)
+class CLIPVisionEncoderOnly(PreTrainedModel):
+    config_class = CLIPVisionEncoderOnlyConfig
+    def __init__(self, config):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__(config)
+        if config.pretrained:
+            self.model = CLIPVisionModelWithProjection.from_pretrained(config.model_name)
+        else:
+            base_cfg = CLIPVisionConfig.from_pretrained(config.model_name)
+            self.model = CLIPVisionModelWithProjection(base_cfg)
+        if config.lora:
+            l_config = LoraConfig(
+                r=config.lora.lora_r,
+                lora_alpha=config.lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=config.lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, data):
+        """
+        Forward pass of the model.
+        """
+        return self.model(**data).image_embeds
+    def parameters(self):
+        return self.model.parameters()
+class OpenCLIPVisionEncoderOnly(torch.nn.Module):
+    def __init__(self, model_name: str, pretrained: bool = True, frozen: bool = False, lora: dict = None):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__()
+        if pretrained:
+            model, _ = open_clip.create_model_from_pretrained(f"hf-hub:{model_name}")
+            model = model.visual
+        else:
+            raise NotImplemented
+        self.model = model
+        if lora:
+            l_config = LoraConfig(
+                r=lora.lora_r,
+                lora_alpha=lora.lora_alpha,
+                target_modules=[
+                    "k_proj",
+                    "v_proj",
+                    "q_proj",
+                    "out_proj",
+                    "fc1",
+                    "fc2",
+                    "visual_projection",
+                    "text_projection"
+                ],
+                lora_dropout=lora.lora_dropout,
+                bias="lora_only",
+            )
+            self.model = get_peft_model(self.model, l_config)
+    def forward(self, image):
+        """
+        Forward pass of the model.
+        """
+        return self.model(image)
+    def save_pretrained(self, save_dir):
+        tensors = self.model.state_dict()
+        safetensors.torch.save_file(tensors, save_dir / HF_SAFE_WEIGHTS_NAME)
+class CustomPriorModel(torch.nn.Module):
+    def __init__(self, in_hidden_state, out_hidden_state):
+        """
+        Initializes the Hugging Face text encoder for CLIP model, inheriting from PreTrainedModel.
+        :param model_name: The name or path of the pretrained model.
+        :param pretrained: Whether to load the pretrained weights.
+        """
+        super().__init__()
+        mid_hidden_state = max(in_hidden_state, out_hidden_state)
+        self.fc1 = torch.nn.Linear(in_hidden_state*2, mid_hidden_state)
+        self.relu = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(mid_hidden_state, out_hidden_state)
+    def reinitialize_model(self):
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                if len(param.shape) > 1:
+                    torch.nn.init.xavier_uniform_(param)
+                else:
+                    if 'weight' in name:
+                        torch.nn.init.normal_(param)
+                    else:
+                        torch.nn.init.zeros_(param)
+    def forward(self, feats):
+        """
+        Forward pass of the model.
+        """
+        return PriorTransformerOutput(predicted_image_embedding=self.fc2(self.relu(self.fc1(feats))))
+    def save_pretrained(self, save_dir):
+        pass
+        # tensors = self.state_dict()
+        # safetensors.torch.save_file(tensors, os.path.join(save_dir, HF_SAFE_WEIGHTS_NAME_PRIOR))
+def test_text_model(register=False, upload=False):
+    # register the classes
+    if register:
+        AutoConfig.register("clip_custom_text_model", CLIPTextEncoderOnlyConfig)
+        AutoModel.register(CLIPTextEncoderOnlyConfig, CLIPTextEncoderOnly)
+        CLIPTextEncoderOnlyConfig.register_for_auto_class()
+        CLIPTextEncoderOnly.register_for_auto_class("AutoModel")
+    if upload:
+        # Initialize the model
+        model_name = "openai/clip-vit-base-patch32"
+        pretrained=True
+        lora=None
+        cfg = CLIPTextEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
+        model = CLIPTextEncoderOnly(cfg)
+        model.push_to_hub("test-text-hf-upload")
+        model = CLIPTextEncoderOnly.from_pretrained("mpatel57/test-text-hf-upload", force_download=True)
+def test_vision_model(register=False, upload=False):
+    # register the classes
+    if register:
+        AutoConfig.register("clip_custom_vision_model", CLIPVisionEncoderOnlyConfig)
+        AutoModel.register(CLIPVisionEncoderOnlyConfig, CLIPVisionEncoderOnly)
+        CLIPVisionEncoderOnlyConfig.register_for_auto_class()
+        CLIPVisionEncoderOnly.register_for_auto_class("AutoModel")
+    if upload:
+        # Initialize the model
+        model_name = "openai/clip-vit-base-patch32"
+        pretrained=True
+        lora=None
+        cfg = CLIPVisionEncoderOnlyConfig(model_name=model_name, pretrained=pretrained, lora=lora)
+        model = CLIPVisionEncoderOnly(cfg)
+        model.push_to_hub("test-vision-hf-upload")
+        model = CLIPVisionEncoderOnly.from_pretrained("mpatel57/test-vision-hf-upload", force_download=True)
+if __name__ == "__main__":
+    test_text_model(register=False, upload=True)
+    test_vision_model(register=False, upload=True)