SeanScripts committed on
Commit
e35a071
·
verified ·
1 Parent(s): 50f22a0

Upload 12 files

Browse files
config.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "NVLM_D"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_nvlm_d.NVLM_D_Config",
8
+ "AutoModel": "modeling_nvlm_d.NVLM_D_Model",
9
+ "AutoModelForCausalLM": "modeling_nvlm_d.NVLM_D_Model"
10
+ },
11
+ "downsample_ratio": 0.5,
12
+ "dynamic_image_size": true,
13
+ "force_image_size": 448,
14
+ "llm_config": {
15
+ "_name_or_path": "Qwen/Qwen2-72B-Instruct",
16
+ "add_cross_attention": false,
17
+ "architectures": [
18
+ "Qwen2ForCausalLM"
19
+ ],
20
+ "attention_bias": true,
21
+ "attention_dropout": 0.0,
22
+ "bad_words_ids": null,
23
+ "begin_suppress_tokens": null,
24
+ "bos_token_id": 151643,
25
+ "chunk_size_feed_forward": 0,
26
+ "cross_attention_hidden_size": null,
27
+ "decoder_start_token_id": null,
28
+ "diversity_penalty": 0.0,
29
+ "do_sample": true,
30
+ "early_stopping": false,
31
+ "encoder_no_repeat_ngram_size": 0,
32
+ "eos_token_id": 151645,
33
+ "exponential_decay_length_penalty": null,
34
+ "finetuning_task": null,
35
+ "forced_bos_token_id": null,
36
+ "forced_eos_token_id": null,
37
+ "hidden_act": "silu",
38
+ "hidden_size": 8192,
39
+ "id2label": {
40
+ "0": "LABEL_0",
41
+ "1": "LABEL_1"
42
+ },
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 29568,
45
+ "is_decoder": false,
46
+ "is_encoder_decoder": false,
47
+ "label2id": {
48
+ "LABEL_0": 0,
49
+ "LABEL_1": 1
50
+ },
51
+ "length_penalty": 1.0,
52
+ "max_length": 20,
53
+ "max_position_embeddings": 32768,
54
+ "min_length": 0,
55
+ "mlp_bias": false,
56
+ "model_type": "llama",
57
+ "no_repeat_ngram_size": 0,
58
+ "num_attention_heads": 64,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_hidden_layers": 80,
62
+ "num_key_value_heads": 8,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": null,
68
+ "prefix": null,
69
+ "pretraining_tp": 1,
70
+ "problem_type": null,
71
+ "pruned_heads": {},
72
+ "remove_invalid_values": false,
73
+ "repetition_penalty": 1.0,
74
+ "return_dict": true,
75
+ "return_dict_in_generate": false,
76
+ "rms_norm_eps": 1e-06,
77
+ "rope_scaling": {
78
+ "factor": 3.0,
79
+ "type": "dynamic"
80
+ },
81
+ "rope_theta": 1000000.0,
82
+ "sep_token_id": null,
83
+ "suppress_tokens": null,
84
+ "task_specific_params": null,
85
+ "temperature": 0.7,
86
+ "tf_legacy_loss": false,
87
+ "tie_encoder_decoder": false,
88
+ "tie_word_embeddings": false,
89
+ "tokenizer_class": null,
90
+ "top_k": 40,
91
+ "top_p": 0.9,
92
+ "torch_dtype": "bfloat16",
93
+ "torchscript": false,
94
+ "transformers_version": "4.39.3",
95
+ "typical_p": 1.0,
96
+ "use_bfloat16": true,
97
+ "use_cache": true,
98
+ "vocab_size": 152064
99
+ },
100
+ "max_dynamic_patch": 6,
101
+ "min_dynamic_patch": 1,
102
+ "model_type": "NVLM_D",
103
+ "ps_version": "v2",
104
+ "quantization_config": {
105
+ "_load_in_4bit": true,
106
+ "_load_in_8bit": false,
107
+ "bnb_4bit_compute_dtype": "bfloat16",
108
+ "bnb_4bit_quant_storage": "uint8",
109
+ "bnb_4bit_quant_type": "nf4",
110
+ "bnb_4bit_use_double_quant": true,
111
+ "llm_int8_enable_fp32_cpu_offload": false,
112
+ "llm_int8_has_fp16_weight": false,
113
+ "llm_int8_skip_modules": null,
114
+ "llm_int8_threshold": 6.0,
115
+ "load_in_4bit": true,
116
+ "load_in_8bit": false,
117
+ "quant_method": "bitsandbytes"
118
+ },
119
+ "select_layer": -1,
120
+ "template": "chatml",
121
+ "torch_dtype": "bfloat16",
122
+ "transformers_version": null,
123
+ "use_backbone_lora": 0,
124
+ "use_llm_lora": 0,
125
+ "use_thumbnail": true,
126
+ "vision_config": {
127
+ "architectures": [
128
+ "InternVisionModel"
129
+ ],
130
+ "attention_dropout": 0.0,
131
+ "drop_path_rate": 0.0,
132
+ "dropout": 0.0,
133
+ "hidden_act": "gelu",
134
+ "hidden_size": 3200,
135
+ "image_size": 448,
136
+ "initializer_factor": 0.1,
137
+ "initializer_range": 1e-10,
138
+ "intermediate_size": 12800,
139
+ "layer_norm_eps": 1e-06,
140
+ "model_type": "intern_vit_6b",
141
+ "norm_type": "rms_norm",
142
+ "num_attention_heads": 25,
143
+ "num_channels": 3,
144
+ "num_hidden_layers": 45,
145
+ "output_attentions": false,
146
+ "output_hidden_states": false,
147
+ "patch_size": 14,
148
+ "qk_normalization": true,
149
+ "qkv_bias": false,
150
+ "return_dict": true,
151
+ "torch_dtype": "bfloat16",
152
+ "transformers_version": "4.39.3",
153
+ "use_bfloat16": true,
154
+ "use_flash_attn": true
155
+ }
156
+ }
configuration_intern_vit.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
class InternVisionConfig(PretrainedConfig):
    r"""
    Configuration container for an [`InternVisionModel`] vision encoder.

    The constructor arguments describe the encoder architecture and are stored unchanged as attributes
    of the same name. Everything not listed below is forwarded to [`PretrainedConfig`]; see its
    documentation for the generic options.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of color channels in the input images (e.g., 3 for RGB).
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries and values in the self-attention layers.
        hidden_size (`int`, *optional*, defaults to 3200):
            Dimensionality of the encoder layers and the pooler layer.
        num_attention_heads (`int`, *optional*, defaults to 25):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 12800):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        qk_normalization (`bool`, *optional*, defaults to `True`):
            Whether to normalize the queries and keys in the self-attention layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        use_flash_attn (`bool`, *optional*, defaults to `True`):
            Whether to use flash attention mechanism.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
        norm_type (`str`, *optional*, defaults to `"rms_norm"`):
            Name of the normalization variant. This class only stores the value; how it is interpreted
            is up to the model code that reads it.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate for stochastic depth.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 0.1):
            A factor for layer scale.
    """

    model_type = 'intern_vit_6b'

    def __init__(
        self,
        num_channels=3,
        patch_size=14,
        image_size=224,
        qkv_bias=False,
        hidden_size=3200,
        num_attention_heads=25,
        intermediate_size=12800,
        qk_normalization=True,
        num_hidden_layers=48,
        use_flash_attn=True,
        hidden_act='gelu',
        norm_type='rms_norm',
        layer_norm_eps=1e-6,
        dropout=0.0,
        drop_path_rate=0.0,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Input geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Attention behaviour.
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.use_flash_attn = use_flash_attn
        self.attention_dropout = attention_dropout

        # Activation and normalization.
        self.hidden_act = hidden_act
        self.norm_type = norm_type
        self.layer_norm_eps = layer_norm_eps

        # Regularization and initialization.
        self.dropout = dropout
        self.drop_path_rate = drop_path_rate
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
        """Build a vision config from a checkpoint name or path.

        The config dictionary is fetched with `cls.get_config_dict`. If it contains a
        `'vision_config'` entry (i.e. it belongs to a composite model), only that nested
        dictionary is used. A warning is logged when the stored `model_type` differs from
        this class's `model_type`; construction proceeds regardless.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # Composite checkpoints nest the vision settings one level down.
        config_dict = config_dict.get('vision_config', config_dict)

        type_mismatch = (
            'model_type' in config_dict
            and hasattr(cls, 'model_type')
            and config_dict['model_type'] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
configuration_nvlm_d.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
3
+ # LICENSE is in incl_licenses directory.
4
+ # --------------------------------------------------------
5
+
6
+ import copy
7
+
8
+ from transformers import AutoConfig, Qwen2Config
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ from .configuration_intern_vit import InternVisionConfig
13
+
14
+ logger = logging.get_logger(__name__)
15
+
16
+
17
class NVLM_D_Config(PretrainedConfig):
    """Composite configuration for NVLM-D: a vision encoder config plus a language model config.

    `vision_config` and `llm_config` are plain dictionaries (as stored in `config.json`) that are
    expanded into an `InternVisionConfig` and a `Qwen2Config` respectively. The remaining
    arguments are stored unchanged as attributes of the same name.

    Args:
        vision_config (`dict`, *optional*): Keyword arguments for `InternVisionConfig`.
            `None` selects that class's defaults.
        llm_config (`dict`, *optional*): Keyword arguments for `Qwen2Config`. Its
            `architectures` entry must start with `'Qwen2ForCausalLM'`. `None` selects a default
            `Qwen2Config`.
        use_backbone_lora (`int`, *optional*, defaults to 0): Stored as given.
        use_llm_lora (`int`, *optional*, defaults to 0): Stored as given.
        select_layer (`int`, *optional*, defaults to -1): Stored as given (logged as
            `vision_select_layer`).
        force_image_size (`int`, *optional*): Stored as given.
        downsample_ratio (`float`, *optional*, defaults to 0.5): Stored as given.
        template (`str`, *optional*): Stored as given.
        dynamic_image_size (`bool`, *optional*, defaults to `False`): Stored as given.
        use_thumbnail (`bool`, *optional*, defaults to `False`): Stored as given.
        ps_version (`str`, *optional*, defaults to `'v1'`): Pixel shuffle version.
        min_dynamic_patch (`int`, *optional*, defaults to 1): Stored as given.
        max_dynamic_patch (`int`, *optional*, defaults to 6): Stored as given.

    Raises:
        ValueError: If `llm_config` does not name `'Qwen2ForCausalLM'` as its first architecture.
    """

    model_type = 'NVLM_D'
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        use_backbone_lora=0,
        use_llm_lora=0,
        select_layer=-1,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        dynamic_image_size=False,
        use_thumbnail=False,
        ps_version='v1',
        min_dynamic_patch=1,
        max_dynamic_patch=6,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Handle vision_config initialization
        if vision_config is None:
            vision_config = {}
            logger.info('vision_config is None. Initializing InternVisionConfig with default values.')

        # Handle llm_config initialization
        if llm_config is None:
            # Fall back to the only supported architecture so that the "default values"
            # promised by the log message are actually used instead of raising below.
            llm_config = {'architectures': ['Qwen2ForCausalLM']}
            logger.info('llm_config is None. Initializing LLM Config with default values.')

        self.vision_config = InternVisionConfig(**vision_config)

        # Check for supported architecture. `architectures` may be absent, None or empty;
        # all of those are reported as an unsupported architecture rather than crashing.
        architectures = llm_config.get('architectures') or [None]
        architecture = architectures[0]
        if architecture == 'Qwen2ForCausalLM':
            self.llm_config = Qwen2Config(**llm_config)
        else:
            raise ValueError(f'Unsupported architecture: {architecture}')

        # Assign configuration values
        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version  # Pixel shuffle version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch

        # Log important parameters
        logger.info(f'vision_select_layer: {self.select_layer}')
        logger.info(f'ps_version: {self.ps_version}')
        logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
        logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default `PretrainedConfig.to_dict`.

        The nested `vision_config` and `llm_config` objects are replaced by their own `to_dict()`
        output, and `model_type` is taken from the class attribute.

        Returns:
            Dict[str, Any]: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['llm_config'] = self.llm_config.to_dict()
        output['model_type'] = self.model_type
        output['use_backbone_lora'] = self.use_backbone_lora
        output['use_llm_lora'] = self.use_llm_lora
        output['select_layer'] = self.select_layer
        output['force_image_size'] = self.force_image_size
        output['downsample_ratio'] = self.downsample_ratio
        output['template'] = self.template
        output['dynamic_image_size'] = self.dynamic_image_size
        output['use_thumbnail'] = self.use_thumbnail
        output['ps_version'] = self.ps_version
        output['min_dynamic_patch'] = self.min_dynamic_patch
        output['max_dynamic_patch'] = self.max_dynamic_patch

        return output
conversation.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py under the Apache License 2.0.
3
+ LICENSE is in incl_licenses directory.
4
+
5
+ Conversation prompt templates.
6
+
7
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
8
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
9
+ """
10
+
11
+ import dataclasses
12
+ from enum import IntEnum, auto
13
+ from typing import Any, Dict, List, Tuple, Union
14
+
15
+
16
class SeparatorStyle(IntEnum):
    """Identifiers for the prompt layouts understood by `Conversation.get_prompt`.

    Values are assigned by `auto()`, so the declaration order below fixes each
    member's integer value; append new members at the end.
    """

    # Generic "role: message" / "role message" layouts.
    ADD_COLON_SINGLE = auto()
    ADD_COLON_TWO = auto()
    ADD_COLON_SPACE_SINGLE = auto()
    NO_COLON_SINGLE = auto()
    NO_COLON_TWO = auto()
    ADD_NEW_LINE_SINGLE = auto()

    # Layouts named after a specific model family or chat format.
    LLAMA2 = auto()
    CHATGLM = auto()
    CHATML = auto()
    CHATINTERN = auto()
    DOLLY = auto()
    RWKV = auto()
    PHOENIX = auto()
    ROBIN = auto()
    FALCON_CHAT = auto()
    CHATGLM3 = auto()
    INTERNVL_ZH = auto()
    MPT = auto()
37
+
38
+
39
@dataclasses.dataclass
class Conversation:
    """A class that manages prompt templates and keeps all conversation history."""

    # The name of this template
    name: str
    # The template of the system prompt; formatted with `system_message`
    system_template: str = '{system_message}'
    # The system message
    system_message: str = ''
    # The names of two roles
    roles: Tuple[str, str] = ('USER', 'ASSISTANT')
    # All messages. Each item is [role, message]. A fresh list is created per instance so that
    # `append_message` works on a directly constructed Conversation and nothing is shared.
    messages: List[List[str]] = dataclasses.field(default_factory=list)
    # The number of few shot examples (leading messages skipped by the export helpers)
    offset: int = 0
    # The separator style and configurations
    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
    sep: str = '\n'
    sep2: Union[str, None] = None
    # Stop criteria (the default one is EOS token)
    stop_str: Union[str, List[str], None] = None
    # Stops generation if meeting any token in this list
    stop_token_ids: Union[List[int], None] = None

    def get_prompt(self) -> str:
        """Get the prompt for generation.

        The system prompt is `system_template` formatted with `system_message`; the messages
        are then appended according to `sep_style`. A message whose text is empty or `None`
        is rendered as an open turn (role prefix only) for the model to complete.

        Raises:
            ValueError: If `sep_style` is not one of the handled `SeparatorStyle` members.
        """
        system_prompt = self.system_template.format(system_message=self.system_message)
        style = self.sep_style

        if style == SeparatorStyle.ADD_COLON_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ':'
            return ret

        if style == SeparatorStyle.ADD_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ': ' + message + seps[i % 2]
                else:
                    ret += role + ':'
            return ret

        if style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ': '  # must be end with a space
            return ret

        if style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
            ret = '' if system_prompt == '' else system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + message + self.sep
                else:
                    ret += role + '\n'
            return ret

        if style == SeparatorStyle.NO_COLON_SINGLE:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret

        if style == SeparatorStyle.NO_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + message + seps[i % 2]
                else:
                    ret += role
            return ret

        if style == SeparatorStyle.RWKV:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    # Normalize line endings and collapse blank lines inside the message.
                    ret += (
                        role
                        + ': '
                        + message.replace('\r\n', '\n').replace('\n\n', '\n')
                    )
                    ret += '\n\n'
                else:
                    ret += role + ':'
            return ret

        if style == SeparatorStyle.LLAMA2:
            seps = [self.sep, self.sep2]
            if self.system_message:
                ret = system_prompt
            else:
                ret = '[INST] '
            for i, (role, message) in enumerate(self.messages):
                # The tag comes from the turn position, not from the stored role.
                tag = self.roles[i % 2]
                if message:
                    if i == 0:
                        ret += message + ' '
                    else:
                        ret += tag + ' ' + message + seps[i % 2]
                else:
                    ret += tag
            return ret

        if style == SeparatorStyle.CHATGLM:
            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
            round_add_n = 1 if self.name == 'chatglm2' else 0
            if system_prompt:
                ret = system_prompt + self.sep
            else:
                ret = ''

            for i, (role, message) in enumerate(self.messages):
                if i % 2 == 0:
                    ret += f'[Round {i//2 + round_add_n}]{self.sep}'

                if message:
                    ret += f'{role}:{message}{self.sep}'
                else:
                    ret += f'{role}:'
            return ret

        if style == SeparatorStyle.CHATML:
            ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + message + self.sep + '\n'
                else:
                    ret += role + '\n'
            return ret

        if style == SeparatorStyle.CHATGLM3:
            ret = ''
            if self.system_message:
                ret += system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + ' ' + message
                else:
                    ret += role
            return ret

        if style == SeparatorStyle.CHATINTERN:
            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ':' + message + seps[i % 2] + '\n'
                else:
                    ret += role + ':'
            return ret

        if style == SeparatorStyle.DOLLY:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ':\n' + message + seps[i % 2]
                    if i % 2 == 1:
                        ret += '\n\n'
                else:
                    ret += role + ':\n'
            return ret

        if style == SeparatorStyle.PHOENIX:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + '<s>' + message + '</s>'
                else:
                    ret += role + ': ' + '<s>'
            return ret

        if style == SeparatorStyle.ROBIN:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ':\n' + message + self.sep
                else:
                    ret += role + ':\n'
            return ret

        if style == SeparatorStyle.FALCON_CHAT:
            ret = ''
            if self.system_message:
                ret += system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ':'
            return ret

        if style == SeparatorStyle.INTERNVL_ZH:
            seps = [self.sep, self.sep2]
            # Uses the raw system message, not the formatted system template.
            ret = self.system_message + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ': ' + message + seps[i % 2]
                else:
                    ret += role + ':'
            return ret

        if style == SeparatorStyle.MPT:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    if isinstance(message, tuple):
                        # A 3-tuple message carries extra payload; only its first item is text.
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret

        raise ValueError(f'Invalid style: {self.sep_style}')

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def to_gradio_chatbot(self):
        """Convert the conversation to gradio chatbot format.

        Messages after `offset` are paired up as `[even-position message, odd-position message]`;
        a trailing unpaired message gets `None` as its partner.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format.

        The system message always comes first. Messages after `offset` alternate between
        'user' and 'assistant' by position; an assistant entry whose text is `None` is omitted.
        """
        ret = [{'role': 'system', 'content': self.system_message}]

        for i, (_, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append({'role': 'user', 'content': msg})
            else:
                if msg is not None:
                    ret.append({'role': 'assistant', 'content': msg})
        return ret

    def copy(self):
        """Return a new Conversation with the same settings and a copied message list."""
        return Conversation(
            name=self.name,
            system_template=self.system_template,
            system_message=self.system_message,
            roles=self.roles,
            # Copy each [role, message] pair so edits to the copy do not touch this instance.
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
        )

    def dict(self):
        """Return the template name, system message, roles, messages and offset as a dictionary."""
        return {
            'template_name': self.name,
            'system_message': self.system_message,
            'roles': self.roles,
            'messages': self.messages,
            'offset': self.offset,
        }
315
+
316
+
317
# A global registry for all conversation templates, keyed by template name
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template.

    Args:
        template: The template to store under `template.name`.
        override: When `False` (the default), registering a name that already exists is an error.

    Raises:
        AssertionError: If `override` is `False` and the name is already registered.
    """
    # Raised explicitly (rather than via `assert`) so the duplicate check still runs under `python -O`.
    if not override and template.name in conv_templates:
        raise AssertionError(f'{template.name} has been registered.')

    conv_templates[template.name] = template
329
+
330
+
331
def get_conv_template(name: str) -> Conversation:
    """Look up a registered template by name and return a copy of it.

    Raises `KeyError` if no template with that name has been registered.
    """
    registered = conv_templates[name]
    return registered.copy()
334
+
335
+
336
+ # Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
337
+ # is that during training, the preprocessing function for the Hermes-2 template doesn't add
338
+ # <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
339
+ # Therefore, they are completely equivalent during inference.
340
+
341
# Register the 'chatml' template at import time.
register_conv_template(
    Conversation(
        name='chatml',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        system_message='Answer the questions.',
        system_template='<|im_start|>system\n{system_message}',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        # The MPT style emits role + message + sep with no extra punctuation.
        sep='<|im_end|>',
        sep_style=SeparatorStyle.MPT,
        # NOTE(review): confirm these ids match the tokenizer in use; the accompanying
        # config.json lists eos_token_id 151645, which is not in this list.
        stop_token_ids=[2, 92543, 92542],
    )
)
357
+
358
+
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_intern_vit.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
3
+ # LICENSE is in incl_licenses directory.
4
+ # --------------------------------------------------------
5
+
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.models.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ has_flash_attn = False
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ """
29
+ The following code is adapted from the
30
+ https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B/blob/main/modeling_intern_vit.py repository
31
+
32
+ We added additional dummy heads to the original num of heads to make the number of heads divisible by 8
33
+ (tensor model parallel size) while having the same output as InternVIT.
34
+ We also turn off flash attn to have deterministic results.
35
+ """
36
class InternRMSNorm(nn.Module):
    """Root-mean-square normalization with a learned per-feature scale.

    Args:
        hidden_size: Length of the learned `weight` vector (initialized to ones).
        eps: Constant added to the variance before the reciprocal square root.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states, var=None):
        """Normalize `hidden_states` over the last dimension and apply `weight`.

        Args:
            hidden_states: Input tensor; the computation runs in float32.
            var: Optional precomputed variance to use instead of the mean of squares over
                the last dimension of `hidden_states`.

        Returns:
            The normalized tensor, cast back to the input dtype and then multiplied by `weight`.
        """
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)

        # Fall back to the mean of squares when the caller did not supply a variance.
        if var is None:
            var = upcast.pow(2).mean(-1, keepdim=True)

        normalized = upcast * torch.rsqrt(var + self.variance_epsilon)
        return normalized.to(original_dtype) * self.weight
52
+
53
+
54
class InternVisionEmbeddings(nn.Module):
    """Turn images into a sequence of patch embeddings with a leading class token.

    A strided convolution (kernel and stride equal to `config.patch_size`) embeds each patch
    into `config.hidden_size` features. A learned class embedding is prepended and a learned
    position embedding with `(image_size // patch_size) ** 2 + 1` positions is added.
    """

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        # Take the channel count from the config instead of hard-coding 3, so the layer
        # agrees with `InternVisionConfig.num_channels` (which defaults to 3).
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1  # +1 for the class token

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed a batch of images.

        Args:
            pixel_values: Image batch fed to the patch convolution; its first dimension is
                the batch size. It is passed to the convolution without a dtype cast.

        Returns:
            Tensor of shape `(batch, 1 + number of patches, embed_dim)`: class token followed
            by the patch embeddings, with position embeddings added.
        """
        batch_size = pixel_values.shape[0]
        # Class and position embeddings are cast to the convolution weight's dtype.
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
        # Flatten the spatial grid and move it ahead of the feature dimension.
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding.to(target_dtype)
        return embeddings
84
+
85
+
86
class InternAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper, padded with dummy heads.

    The QKV and output projections are sized for ``num_heads + num_dummy_heads``
    heads (``dummy_dim`` channels) rather than ``embed_dim`` channels.
    ``forward`` always runs the naive (pure PyTorch) attention path;
    ``_flash_attn`` is kept available but is not called from ``forward``.
    """

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.use_flash_attn = config.use_flash_attn and has_flash_attn
        if config.use_flash_attn and not has_flash_attn:
            print('Warning: Flash Attention is not available, use_flash_attn is set to False.')

        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
                f' {self.num_heads}).'
            )

        self.scale = self.head_dim ** -0.5
        # We added additional dummy heads to the original num of heads to make the number of heads divisible by 8.
        self.num_dummy_heads = 7
        self.dummy_dim = (self.num_dummy_heads + self.num_heads) * self.head_dim
        self.qkv = nn.Linear(self.embed_dim, 3 * self.dummy_dim, bias=config.qkv_bias)
        self.attn_drop = nn.Dropout(config.attention_dropout)
        self.proj_drop = nn.Dropout(config.dropout)

        self.qk_normalization = config.qk_normalization

        if self.qk_normalization:
            self.q_norm = InternRMSNorm(self.dummy_dim, eps=config.layer_norm_eps)
            self.k_norm = InternRMSNorm(self.dummy_dim, eps=config.layer_norm_eps)

        if self.use_flash_attn:
            self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
        self.proj = nn.Linear(self.dummy_dim, self.embed_dim)

    def _naive_attn(self, x):
        """Scaled dot-product attention in plain PyTorch.

        Args:
            x: Input of shape ``(B, N, C)``.

        Returns:
            The projected attention output after ``proj`` and ``proj_drop``.
        """
        B, N, C = x.shape
        total_heads = self.num_dummy_heads + self.num_heads

        # Go through the nn.Linear modules (not their raw weights) so that
        # wrapped or replaced linear layers keep working.
        qkv = self.qkv(x).reshape(B, N, 3, total_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        if self.qk_normalization:
            B_, H_, N_, D_ = q.shape
            q_flat = q.transpose(1, 2).flatten(-2, -1)
            k_flat = k.transpose(1, 2).flatten(-2, -1)
            # The variance handed to the norm layers is taken over the first
            # `embed_dim` channels only (presumably the non-dummy heads -- confirm
            # against the checkpoint layout).
            q_var = q_flat[:, :, :self.embed_dim].float().pow(2).sum(-1, keepdim=True) / self.embed_dim
            k_var = k_flat[:, :, :self.embed_dim].float().pow(2).sum(-1, keepdim=True) / self.embed_dim

            q = self.q_norm(q_flat, var=q_var).view(B_, N_, H_, D_).transpose(1, 2)
            k = self.k_norm(k_flat, var=k_var).view(B_, N_, H_, D_).transpose(1, 2)

        attn = ((q * self.scale) @ k.transpose(-2, -1))
        attn = attn.softmax(dim=-1)

        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
        """Attention through the ``FlashAttention`` module stored in ``inner_attn``.

        Only usable when ``use_flash_attn`` was enabled at construction time
        (otherwise ``inner_attn`` does not exist).
        """
        qkv = self.qkv(x)
        # The projection holds `num_heads + num_dummy_heads` heads per q/k/v;
        # splitting by `num_heads` alone cannot factor the channel dimension.
        qkv = rearrange(
            qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads + self.num_dummy_heads
        )

        if self.qk_normalization:
            q, k, v = qkv.unbind(2)
            # NOTE(review): unlike `_naive_attn`, no explicit `var=` is passed to
            # the norm layers here, so the two paths may normalise differently.
            q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
            k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
            qkv = torch.stack([q, k, v], dim=2)

        context, _ = self.inner_attn(
            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
        )
        outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
        outs = self.proj_drop(outs)
        return outs

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``hidden_states`` using the naive path."""
        x = self._naive_attn(hidden_states)
        return x
175
+
176
+
177
class InternMLP(nn.Module):
    """Feed-forward block: ``fc1`` -> activation -> ``fc2``."""

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        # Activation is looked up by name from the config.
        self.act = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Expand to ``intermediate_size``, activate, and project back to ``hidden_size``."""
        expanded = self.fc1(hidden_states)
        activated = self.act(expanded)
        return self.fc2(activated)
192
+
193
+
194
class InternVisionEncoderLayer(nn.Module):
    """Pre-norm transformer block: attention and MLP sub-layers, each with a
    learned per-channel scale (``ls1`` / ``ls2``), a drop-path module, and a
    residual connection.
    """

    def __init__(self, config: InternVisionConfig, drop_path_rate: float):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.norm_type = config.norm_type

        self.attn = InternAttention(config)
        self.mlp = InternMLP(config)
        self.norm1 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.norm2 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)

        # Per-channel residual scales, initialised to `config.initializer_factor`.
        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
        # A zero drop-path rate degenerates to an identity module.
        self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
        self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

    def forward(
            self,
            hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`

        Returns:
            `torch.Tensor`: the layer output (a single tensor, not a tuple).
        """
        hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)

        hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)

        return hidden_states
224
+
225
+
226
class InternVisionEncoder(nn.Module):
    """
    Transformer encoder made of `config.num_hidden_layers` stacked
    [`InternVisionEncoderLayer`] blocks applied in sequence.

    Args:
        config (`InternVisionConfig`):
            The vision configuration used to build every layer.
    """

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        # Stochastic depth decay rule: the drop-path rate grows linearly from 0
        # at the first layer to `config.drop_path_rate` at the last one.
        drop_rates = torch.linspace(0, config.drop_path_rate, config.num_hidden_layers).tolist()
        self.layers = nn.ModuleList(
            [InternVisionEncoderLayer(config, drop_rates[layer_idx])
             for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = True

    def forward(
            self,
            inputs_embeds,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            output_hidden_states (`bool`, *optional*):
                Whether to also return the input of every layer plus the final output.
                Falls back to `config.output_hidden_states` when `None`.
            return_dict (`bool`, *optional*):
                Whether to return a [`~utils.ModelOutput`] instead of a plain tuple.
                Falls back to `config.use_return_dict` when `None`.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        collected = [] if output_hidden_states else None
        hidden_states = inputs_embeds

        for layer in self.layers:
            if collected is not None:
                collected.append(hidden_states)
            if self.gradient_checkpointing and self.training:
                # Recompute activations in backward instead of storing them.
                hidden_states = torch.utils.checkpoint.checkpoint(layer, hidden_states)
            else:
                hidden_states = layer(hidden_states)

        if collected is not None:
            collected.append(hidden_states)
        encoder_states = None if collected is None else tuple(collected)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states
            )
        return tuple(v for v in [hidden_states, encoder_states] if v is not None)
290
+
291
+
292
class InternVisionModel(PreTrainedModel):
    """Vision transformer: patch/class embeddings followed by the encoder stack.

    The pooled output is the hidden state of the first token of the last layer.
    """

    main_input_name = 'pixel_values'
    _supports_flash_attn_2 = True
    config_class = InternVisionConfig
    _no_split_modules = ['InternVisionEncoderLayer']

    def __init__(self, config: InternVisionConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InternVisionEmbeddings(config)
        self.encoder = InternVisionEncoder(config)

    def resize_pos_embeddings(self, old_size, new_size, patch_size):
        """Bicubically resample the patch position embeddings to a new image size.

        The first position (class token) is kept unchanged; the remaining
        positions are reshaped to a square grid, interpolated, and flattened
        back. Replaces `embeddings.position_embedding` with a new parameter.

        Args:
            old_size: Image side length the current embeddings were built for.
            new_size: Target image side length.
            patch_size: Patch side length; both sizes are floor-divided by it.
        """
        pos_emb = self.embeddings.position_embedding
        _, _, embed_dim = pos_emb.shape
        cls_emb = pos_emb[:, :1, :]
        # (1, grid*grid, dim) -> (1, dim, grid, grid) so F.interpolate sees a 2-D map.
        pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
        # Interpolate in float32, then cast back to the stored dtype.
        pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
        self.embeddings.position_embedding = nn.Parameter(pos_emb)
        self.embeddings.image_size = new_size
        logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))

    def get_input_embeddings(self):
        """Return the embedding module that maps pixels to token embeddings."""
        return self.embeddings

    def forward(
            self,
            pixel_values: Optional[torch.FloatTensor] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            pixel_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Encode images (or precomputed embeddings) with the vision transformer.

        Args:
            pixel_values: 4-D image batch; embedded via `self.embeddings`.
            output_hidden_states: Whether to return per-layer hidden states;
                defaults to `config.output_hidden_states`.
            return_dict: Whether to return a model-output object instead of a
                tuple; defaults to `config.use_return_dict`.
            pixel_embeds: Precomputed embeddings; when given they are used
                directly and `pixel_values` is ignored.

        Returns:
            `BaseModelOutputWithPooling`, or the tuple
            `(last_hidden_state, pooled_output, *extra_encoder_outputs)`.

        Raises:
            ValueError: If neither input is given, or `pixel_values` is not 4-D.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None and pixel_embeds is None:
            raise ValueError('You have to specify pixel_values or pixel_embeds')

        if pixel_embeds is not None:
            hidden_states = pixel_embeds
        else:
            if len(pixel_values.shape) == 4:
                hidden_states = self.embeddings(pixel_values)
            else:
                raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index instead of attribute access: with return_dict=False the encoder
        # returns a plain tuple, which has no `.last_hidden_state`.
        last_hidden_state = encoder_outputs[0]

        # The first token (class token) serves as the pooled representation.
        pooled_output = last_hidden_state[:, 0, :]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
modeling_nvlm_d.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Adapted from https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B under MIT License
3
+ # LICENSE is in incl_licenses directory.
4
+ # --------------------------------------------------------
5
+
6
+
7
+ import warnings
8
+ from typing import Any, List, Optional, Tuple, Union
9
+
10
+ import torch.utils.checkpoint
11
+ import transformers
12
+ from torch import nn
13
+ from torch.nn import CrossEntropyLoss
14
+ from transformers import AutoModel, GenerationConfig, Qwen2ForCausalLM
15
+ from transformers.modeling_outputs import CausalLMOutputWithPast
16
+ from transformers.modeling_utils import PreTrainedModel
17
+ from transformers.utils import ModelOutput, logging
18
+
19
+ from .configuration_nvlm_d import NVLM_D_Config
20
+ from .conversation import get_conv_template
21
+ from .modeling_intern_vit import InternVisionModel, has_flash_attn
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ """
27
+ The following code is adapted from the
28
+ https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B/blob/main/modeling_internvl_chat.py repository
29
+
30
+ The chat function is adapted to handle NVLM 1-D tile-tagging design for dynamic high-resolution images.
31
+ """
32
def version_cmp(v1, v2, op='eq'):
    """Compare two version strings with a named comparison operator.

    Args:
        v1: Left-hand version string.
        v2: Right-hand version string.
        op: Name of a function in the `operator` module (e.g. 'eq', 'ge', 'lt').

    Returns:
        The result of applying the operator to the two parsed versions.
    """
    import operator

    from packaging import version

    compare = getattr(operator, op)
    lhs = version.parse(v1)
    rhs = version.parse(v2)
    return compare(lhs, rhs)
38
+
39
+
40
class NVLM_D_Model(PreTrainedModel):
    """Vision-language model: InternViT encoder -> MLP projector -> Qwen2 causal LM.

    Image features are projected by `mlp1` and written into the language-model
    input embeddings at every position whose token id equals
    `self.img_context_token_id` (set by the `chat*` methods).
    """

    config_class = NVLM_D_Config
    main_input_name = 'pixel_values'
    _supports_flash_attn_2 = True
    _no_split_modules = ['InternVisionModel', 'Qwen2DecoderLayer']

    def __init__(self, config: NVLM_D_Config, vision_model=None, language_model=None, use_flash_attn=True):
        """Build (or adopt) the vision tower, language model and projector.

        Args:
            config: Combined model configuration.
            vision_model: Optional pre-built vision model; built from
                `config.vision_config` when `None`.
            language_model: Optional pre-built language model; built from
                `config.llm_config` when `None`.
            use_flash_attn: Request flash attention; ignored when the
                flash-attention package is unavailable.

        Raises:
            NotImplementedError: If the configured LLM architecture is not
                `Qwen2ForCausalLM` and no `language_model` is supplied.
        """
        super().__init__(config)

        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
        image_size = config.force_image_size or config.vision_config.image_size
        patch_size = config.vision_config.patch_size
        self.patch_size = patch_size
        self.select_layer = config.select_layer
        self.template = config.template
        # Tokens per tile after pixel-shuffle downsampling.
        self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
        self.downsample_ratio = config.downsample_ratio
        self.ps_version = config.ps_version
        use_flash_attn = use_flash_attn if has_flash_attn else False
        config.vision_config.use_flash_attn = True if use_flash_attn else False
        config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'

        logger.info(f'num_image_token: {self.num_image_token}')
        logger.info(f'ps_version: {self.ps_version}')
        if vision_model is not None:
            self.vision_model = vision_model
        else:
            self.vision_model = InternVisionModel(config.vision_config)
        if language_model is not None:
            self.language_model = language_model
        else:
            if config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
                self.language_model = Qwen2ForCausalLM(config.llm_config)
            else:
                raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')

        vit_hidden_size = config.vision_config.hidden_size
        llm_intermediate_size = config.llm_config.intermediate_size
        llm_hidden_size = config.llm_config.hidden_size

        # Projector from pixel-shuffled ViT features to the LLM hidden size.
        self.mlp1 = nn.Sequential(
            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_intermediate_size, bias=False),
            nn.GELU(),
            nn.Linear(llm_intermediate_size, llm_hidden_size, bias=False)
        )

        self.img_context_token_id = None
        self.conv_template = get_conv_template(self.template)
        self.system_message = self.conv_template.system_message

    def forward(
            self,
            pixel_values: torch.FloatTensor,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            image_flags: Optional[torch.LongTensor] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model on text embeddings with image features spliced in.

        Args:
            pixel_values: Image tiles passed to `extract_feature`.
            input_ids: Token ids; positions equal to `img_context_token_id`
                receive image features.
            attention_mask, position_ids, past_key_values, use_cache,
            output_attentions, output_hidden_states: Forwarded to the language model.
            image_flags: Optional per-tile flags; only tiles flagged `1` are
                used. When `None`, every tile is used.
            labels: Optional targets for the shifted next-token cross-entropy loss.
            return_dict: Return a `CausalLMOutputWithPast` instead of a tuple;
                defaults to `config.use_return_dict`.

        Returns:
            `CausalLMOutputWithPast`, or a tuple starting with the loss (when
            labels are given) followed by the logits.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_embeds = self.language_model.get_input_embeddings()(input_ids)

        vit_embeds = self.extract_feature(pixel_values)
        if image_flags is not None:
            vit_embeds = vit_embeds[image_flags.squeeze(-1) == 1]
        vit_batch_size = pixel_values.shape[0]

        B, N, C = input_embeds.shape
        input_embeds = input_embeds.reshape(B * N, C)

        # get_rank() raises without an initialised process group, so treat a
        # non-distributed run like rank 0.
        dist = torch.distributed
        is_main_process = not (dist.is_available() and dist.is_initialized()) or dist.get_rank() == 0
        if is_main_process:
            print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')

        input_ids = input_ids.reshape(B * N)
        selected = (input_ids == self.img_context_token_id)
        try:
            # `* 0.0 +` overwrites the values while keeping the original
            # embeddings in the expression.
            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
        except Exception as e:
            # Best-effort fallback for a token-count mismatch: warn and use
            # only as many image features as there are placeholder positions.
            vit_embeds = vit_embeds.reshape(-1, C)
            print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
                  f'vit_embeds.shape={vit_embeds.shape}')
            n_token = selected.sum()
            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]

        input_embeds = input_embeds.reshape(B, N, C)

        outputs = self.language_model(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Index rather than `.logits`: with return_dict=False the language
        # model returns a tuple. No labels are passed down, so logits come first.
        logits = outputs[0]

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def pixel_shuffle(self, x, scale_factor=0.5):
        """Trade spatial resolution for channel depth on an `(N, W, H, C)` tensor.

        With `scale_factor=s` the result has spatial size `(W*s, H*s)` and
        `C / s**2` channels. For `ps_version == 'v1'` the final axis swap is
        skipped and a warning is emitted.
        """
        n, w, h, c = x.size()
        # N, W, H, C --> N, W, H * scale, C // scale
        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
        x = x.permute(0, 2, 1, 3).contiguous()
        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
                   int(c / (scale_factor * scale_factor)))
        if self.ps_version == 'v1':
            warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
                          'which results in a transposed image.')
        else:
            x = x.permute(0, 2, 1, 3).contiguous()
        return x

    def extract_feature(self, pixel_values):
        """Encode image tiles and project them to the language-model hidden size.

        Takes the last hidden state (or the layer selected by `select_layer`),
        drops the first (class) token, pixel-shuffles the square token grid,
        and applies `mlp1`.
        """
        if self.select_layer == -1:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=False,
                return_dict=True).last_hidden_state
        else:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=True,
                return_dict=True).hidden_states[self.select_layer]
        vit_embeds = vit_embeds[:, 1:, :]

        # The remaining tokens are treated as a square grid.
        h = w = int(vit_embeds.shape[1] ** 0.5)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
        vit_embeds = self.mlp1(vit_embeds)
        return vit_embeds

    # ------------------------------------------------------------------
    # Chat helpers.
    #
    # The chat functions handle the NVLM 1-D tile-tagging design for dynamic
    # high-resolution images. Additionally, they support:
    # - Chat without a system prompt.
    # - Chat without an image prompt.
    # ------------------------------------------------------------------

    def _nvlm_prepare_image_context(self, tokenizer, pixel_values, num_patches_list, IMG_CONTEXT_TOKEN):
        """Default and validate the per-image tile counts; record the context token id."""
        if num_patches_list is None:
            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)

        self.img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
        return num_patches_list

    def _nvlm_insert_image_tokens(self, query, num_patches_list, IMG_CONTEXT_TOKEN):
        """Replace each '<image>' placeholder, in order, with its tile-tagged token block."""
        context_tokens = IMG_CONTEXT_TOKEN * self.num_image_token
        for num_patches in num_patches_list:
            # Tiles 1..num_patches-1 get numbered tags; the last tile is the thumbnail.
            tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)] + ["<tile_global_thumbnail>"]
            image_tokens = ''.join(tag + context_tokens for tag in tile_pos_identifiers)
            image_tokens = '<Image>' + image_tokens + '</Image>'
            query = query.replace('<image>', image_tokens, 1)
        return query

    def _nvlm_generate_response(self, tokenizer, query, pixel_values, visual_features, generation_config,
                                sep, skip_special_tokens):
        """Tokenize `query`, generate, decode the first sequence and cut it at `sep`."""
        model_inputs = tokenizer(query, return_tensors='pt')
        input_ids = model_inputs['input_ids'].cuda()
        attention_mask = model_inputs['attention_mask'].cuda()
        generation_output = self.generate(
            pixel_values=pixel_values,
            visual_features=visual_features,
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_config
        )
        response = tokenizer.batch_decode(generation_output, skip_special_tokens=skip_special_tokens)[0]
        return response.split(sep)[0].strip()

    @staticmethod
    def _nvlm_printable_query(query, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN):
        """Return `query` with the image context tokens stripped for verbose printing."""
        query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
        return query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')

    def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
             num_patches_list=None, IMG_START_TOKEN='<|vision_start|>', IMG_END_TOKEN='<|vision_end|>',
             IMG_CONTEXT_TOKEN='<|vision_pad|>', verbose=False, visual_features=None):
        """Answer `question` using the conversation template and the stored system message.

        Args:
            tokenizer: Tokenizer used for encoding the prompt and decoding the output.
            pixel_values: Image tiles, or `None` for text-only chat.
            question: User message; '<image>' is prepended when images are given,
                there is no history, and the placeholder is absent.
            generation_config: Mapping of keyword arguments forwarded to `generate`.
            history: Optional list of `(question, answer)` pairs; the new turn
                is appended to it in place.
            return_history: Return `(response, history)` instead of `response`.
            num_patches_list: Tiles per image; defaults to one image holding all tiles.
            IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN: Special image tokens.
            verbose: Print the tile count and, when history is not returned, the
                prompt and response.
            visual_features: Optional precomputed visual features forwarded to `generate`.

        Returns:
            The response string, or `(response, history)` when `return_history` is set.
        """
        if history is None and pixel_values is not None and '<image>' not in question:
            question = '<image>\n' + question

        num_patches_list = self._nvlm_prepare_image_context(
            tokenizer, pixel_values, num_patches_list, IMG_CONTEXT_TOKEN)

        template = get_conv_template(self.template)
        template.system_message = self.system_message

        history = [] if history is None else history
        for (old_question, old_answer) in history:
            template.append_message(template.roles[0], old_question)
            template.append_message(template.roles[1], old_answer)
        template.append_message(template.roles[0], question)
        template.append_message(template.roles[1], None)
        query = template.get_prompt()

        if verbose and pixel_values is not None:
            print(f'dynamic ViT batch size: {pixel_values.shape[0]}')

        query = self._nvlm_insert_image_tokens(query, num_patches_list, IMG_CONTEXT_TOKEN)

        # Unlike the other chat variants, this one does not override
        # `eos_token_id` and decodes with special tokens kept; the response is
        # still cut at `template.sep`.
        response = self._nvlm_generate_response(
            tokenizer, query, pixel_values, visual_features, generation_config,
            sep=template.sep, skip_special_tokens=False)
        history.append((question, response))
        if return_history:
            return response, history
        if verbose:
            print(self._nvlm_printable_query(query, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN), response)
        return response

    def chat_without_sys_prompt(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
                                num_patches_list=None, IMG_START_TOKEN='<|vision_start|>', IMG_END_TOKEN='<|vision_end|>',
                                IMG_CONTEXT_TOKEN='<|vision_pad|>', verbose=False, visual_features=None):
        """Like `chat`, but the system prompt is removed from the final prompt.

        A fixed system prompt is placed in the template and its length is then
        cut from the front of the rendered prompt. Generation stops at the
        template separator and special tokens are skipped when decoding. The
        caller's `generation_config` is not modified.

        Returns:
            The response string, or `(response, history)` when `return_history` is set.
        """
        if history is None and pixel_values is not None and '<image>' not in question:
            question = '<image>\n' + question

        num_patches_list = self._nvlm_prepare_image_context(
            tokenizer, pixel_values, num_patches_list, IMG_CONTEXT_TOKEN)

        template = get_conv_template(self.template)
        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"  # override dummy system prompt
        template.system_message = system_prompt
        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)

        history = [] if history is None else history
        for (old_question, old_answer) in history:
            template.append_message(template.roles[0], old_question)
            template.append_message(template.roles[1], old_answer)
        template.append_message(template.roles[0], question)
        template.append_message(template.roles[1], None)
        query = template.get_prompt()

        if verbose and pixel_values is not None:
            print(f'dynamic ViT batch size: {pixel_values.shape[0]}')

        # Drop the leading system prompt from the rendered conversation.
        query = query[len(system_prompt):]

        query = self._nvlm_insert_image_tokens(query, num_patches_list, IMG_CONTEXT_TOKEN)

        # Work on a copy so the caller's mapping is left untouched.
        generation_config = {**generation_config, 'eos_token_id': eos_token_id}
        response = self._nvlm_generate_response(
            tokenizer, query, pixel_values, visual_features, generation_config,
            sep=template.sep, skip_special_tokens=True)
        history.append((question, response))
        if return_history:
            return response, history
        if verbose:
            print(self._nvlm_printable_query(query, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN), response)
        return response

    def chat_without_chat_prompt(self, tokenizer, pixel_values, question, generation_config,
                                 num_patches_list=None, IMG_START_TOKEN='<|vision_start|>', IMG_END_TOKEN='<|vision_end|>',
                                 IMG_CONTEXT_TOKEN='<|vision_pad|>', verbose=False, visual_features=None):
        """Generate from the raw `question` text without any conversation template.

        The template is only consulted for its separator, which is used as the
        EOS token and to cut the decoded response. The caller's
        `generation_config` is not modified.

        Returns:
            The response string.
        """
        if pixel_values is not None and '<image>' not in question:
            question = '<image>\n' + question

        num_patches_list = self._nvlm_prepare_image_context(
            tokenizer, pixel_values, num_patches_list, IMG_CONTEXT_TOKEN)

        template = get_conv_template(self.template)
        template.system_message = self.system_message
        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)

        if verbose and pixel_values is not None:
            print(f'dynamic ViT batch size: {pixel_values.shape[0]}')

        query = self._nvlm_insert_image_tokens(question, num_patches_list, IMG_CONTEXT_TOKEN)

        # Work on a copy so the caller's mapping is left untouched.
        generation_config = {**generation_config, 'eos_token_id': eos_token_id}
        response = self._nvlm_generate_response(
            tokenizer, query, pixel_values, visual_features, generation_config,
            sep=template.sep, skip_special_tokens=True)

        if verbose:
            print(self._nvlm_printable_query(query, IMG_START_TOKEN, IMG_END_TOKEN, IMG_CONTEXT_TOKEN), response)
        return response

    @torch.no_grad()
    def generate(
            self,
            pixel_values: Optional[torch.FloatTensor] = None,
            input_ids: Optional[torch.LongTensor] = None,
            attention_mask: Optional[torch.LongTensor] = None,
            visual_features: Optional[torch.FloatTensor] = None,
            generation_config: Optional[GenerationConfig] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            use_cache: Optional[bool] = None,
            **generate_kwargs,
    ) -> torch.LongTensor:
        """Generate tokens from text embeddings with image features spliced in.

        When `pixel_values` is given, image features (from `visual_features`
        passed through `mlp1` if provided, else from `extract_feature`) replace
        the embeddings at positions equal to `self.img_context_token_id`, which
        must have been set beforehand (the `chat*` methods do this). Without
        `pixel_values` the plain token embeddings are used.

        Returns:
            The output of `language_model.generate`.
        """
        if pixel_values is not None:
            if visual_features is not None:
                vit_embeds = visual_features.cuda()
                vit_embeds = self.mlp1(vit_embeds)
            else:
                vit_embeds = self.extract_feature(pixel_values)

            input_embeds = self.language_model.get_input_embeddings()(input_ids)
            B, N, C = input_embeds.shape
            input_embeds = input_embeds.reshape(B * N, C)

            input_ids = input_ids.reshape(B * N)
            selected = (input_ids == self.img_context_token_id)
            assert selected.sum() != 0
            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

            input_embeds = input_embeds.reshape(B, N, C)
        else:
            input_embeds = self.language_model.get_input_embeddings()(input_ids)

        outputs = self.language_model.generate(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            generation_config=generation_config,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
            **generate_kwargs,
        )

        return outputs
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": 448,
3
+ "do_center_crop": true,
4
+ "do_normalize": true,
5
+ "do_resize": true,
6
+ "feature_extractor_type": "CLIPFeatureExtractor",
7
+ "image_mean": [
8
+ 0.485,
9
+ 0.456,
10
+ 0.406
11
+ ],
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "size": 448
19
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff