File size: 3,461 Bytes
1971a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""TraVisionLM configuration"""

from transformers import PretrainedConfig
from transformers import logging, CONFIG_MAPPING
import warnings

# Module-level logger, created through the logging utilities imported from transformers.
logger = logging.get_logger(__name__)

class TraVisionLMConfig(PretrainedConfig):
    """Configuration for TraVisionLM, holding a vision sub-config and a text sub-config.

    Args:
        vision_config: Either a dict of keyword arguments for the vision config
            (``"model_type"`` defaults to ``"siglip_vision_model"`` when absent),
            an already-built config object (stored as given), or ``None`` to use
            the built-in SigLIP vision defaults defined in ``__init__``.
        text_config: Either a dict of keyword arguments for the text config
            (``"model_type"`` defaults to ``"gpt2"`` when absent), an already-built
            config object (stored as given), or ``None`` to use the built-in GPT-2
            defaults defined in ``__init__``.
        ignore_index: Stored as ``self.ignore_index``
            (presumably the label value skipped by the loss -- confirm in the model code).
        image_token_idx: Stored as ``self.image_token_index``.
        vocab_size: Stored privately and exposed through the deprecated
            ``vocab_size`` property; also used as ``vocab_size`` of the default
            text config.
        projection_dim: Stored as ``self.projection_dim`` and copied onto
            ``self.vision_config.projection_dim``.
        hidden_size: Stored as ``self.hidden_size``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "travisionlm"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index: int = -100,
        image_token_idx: int = 50257,
        vocab_size: int = 51282,
        projection_dim: int = 768,
        hidden_size: int = 1280,
        **kwargs,
    ) -> None:
        self.ignore_index = ignore_index
        self.image_token_index = image_token_idx
        # Written to the private slot directly; the public `vocab_size` property
        # reads from it, so no second assignment through the setter is needed.
        self._vocab_size = vocab_size
        self.projection_dim = projection_dim
        self.hidden_size = hidden_size
        self.vision_config = vision_config
        self.is_encoder_decoder = False

        if isinstance(vision_config, dict):
            # Work on a copy so the caller's dict is not modified when the
            # default model type is filled in.
            vision_kwargs = dict(vision_config)
            vision_kwargs.setdefault("model_type", "siglip_vision_model")
            self.vision_config = CONFIG_MAPPING[vision_kwargs["model_type"]](**vision_kwargs)
        elif vision_config is None:
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
                attention_dropout=0.0,
                hidden_act="gelu_pytorch_tanh",
                hidden_size=768,
                image_size=256,
                intermediate_size=3072,
                layer_norm_eps=1e-06,
                num_attention_heads=12,
                num_channels=3,
                num_hidden_layers=12,
                patch_size=16,
            )

        self.text_config = text_config

        if isinstance(text_config, dict):
            # Same copy-before-defaulting treatment as for the vision config.
            text_kwargs = dict(text_config)
            text_kwargs.setdefault("model_type", "gpt2")
            self.text_config = CONFIG_MAPPING[text_kwargs["model_type"]](**text_kwargs)
        elif text_config is None:
            self.text_config = CONFIG_MAPPING["gpt2"](
                activation_function="gelu_new",
                attn_pdrop=0.1,
                embd_pdrop=0.1,
                initializer_range=0.02,
                layer_norm_epsilon=1e-05,
                n_ctx=1024,
                n_embd=1280,
                n_head=20,
                n_layer=36,
                n_positions=1024,
                reorder_and_upcast_attn=False,
                resid_pdrop=0.1,
                scale_attn_by_inverse_layer_idx=False,
                scale_attn_weights=True,
                vocab_size=vocab_size,
            )

        # One image token per patch: (patches per side) squared.
        self.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
        # NOTE(review): this is assigned before `super().__init__(**kwargs)`; if the
        # base initializer also assigns `pad_token_id`, the value set here would be
        # replaced -- confirm against the installed transformers version.
        self.pad_token_id = self.text_config.pad_token_id
        self.vision_config.projection_dim = projection_dim
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Return the stored vocabulary size, emitting a ``FutureWarning`` on every read."""
        warnings.warn(
            "The `vocab_size` attribute is deprecated and will be removed in v4.44, Please use `text_config.vocab_size` instead.",
            FutureWarning,
        )
        return self._vocab_size

    @vocab_size.setter
    def vocab_size(self, value):
        """Store ``value`` as the vocabulary size without emitting a warning."""
        self._vocab_size = value

    def to_dict(self):
        """Return the parent's dict serialization with the private ``_vocab_size`` key removed."""
        output = super().to_dict()
        output.pop("_vocab_size", None)
        return output