update

Files changed (4) hide show

modeling_qwen.py +10 -153
qwen.tiktoken +0 -0
tokenization_qwen.py +432 -0
visual.py +70 -19

modeling_qwen.py CHANGED Viewed

@@ -69,44 +69,7 @@ Pass argument `stream` to model.chat() is buggy, deprecated, and marked for remo
 apply_rotary_emb_func = None
 rms_norm = None
-flash_attn_unpadded_func = None
-def _import_flash_attn():
-    global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func
-    try:
-        from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func
-        apply_rotary_emb_func = __apply_rotary_emb_func
-    except ImportError:
-        logger.warn(
-            "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency "
-            "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary"
-        )
-    try:
-        from flash_attn.ops.rms_norm import rms_norm as __rms_norm
-        rms_norm = __rms_norm
-    except ImportError:
-        logger.warn(
-            "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency "
-            "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm"
-        )
-    try:
-        import flash_attn
-        if not hasattr(flash_attn, '__version__'):
-            from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func
-        else:
-            if int(flash_attn.__version__.split(".")[0]) >= 2:
-                from flash_attn.flash_attn_interface import flash_attn_varlen_func as __flash_attn_unpadded_func
-            else:
-                from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func
-        flash_attn_unpadded_func = __flash_attn_unpadded_func
-    except ImportError:
-        logger.warn(
-            "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency "
-            "https://github.com/Dao-AILab/flash-attention"
-        )
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
@@ -141,70 +104,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
-class FlashSelfAttention(torch.nn.Module):
-    def __init__(
-        self,
-        causal=False,
-        softmax_scale=None,
-        attention_dropout=0.0,
-    ):
-        super().__init__()
-        assert flash_attn_unpadded_func is not None, (
-            "Please install FlashAttention first, " "e.g., with pip install flash-attn"
-        )
-        assert (
-            rearrange is not None
-        ), "Please install einops first, e.g., with pip install einops"
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-    def forward(self, q, k, v):
-        assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v)))
-        assert all((i.is_cuda for i in (q, k, v)))
-        batch_size, seqlen_q = q.shape[0], q.shape[1]
-        seqlen_k = k.shape[1]
-        q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
-        cu_seqlens_q = torch.arange(
-            0,
-            (batch_size + 1) * seqlen_q,
-            step=seqlen_q,
-            dtype=torch.int32,
-            device=q.device,
-        )
-        if self.training:
-            assert seqlen_k == seqlen_q
-            is_causal = self.causal
-            cu_seqlens_k = cu_seqlens_q
-        else:
-            is_causal = seqlen_q == seqlen_k
-            cu_seqlens_k = torch.arange(
-                0,
-                (batch_size + 1) * seqlen_k,
-                step=seqlen_k,
-                dtype=torch.int32,
-                device=q.device,
-            )
-            self.dropout_p = 0
-        output = flash_attn_unpadded_func(
-            q,
-            k,
-            v,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            seqlen_q,
-            seqlen_k,
-            self.dropout_p,
-            softmax_scale=self.softmax_scale,
-            causal=is_causal,
-        )
-        output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
-        return output
 class QWenAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -225,7 +124,6 @@ class QWenAttention(nn.Module):
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
-        self.use_flash_attn = config.use_flash_attn
         self.scale_attn_weights = True
         self.projection_size = config.kv_channels * config.num_attention_heads
@@ -242,15 +140,6 @@ class QWenAttention(nn.Module):
         )
         self.is_fp32 = not (config.bf16 or config.fp16)
-        if (
-            self.use_flash_attn
-            and flash_attn_unpadded_func is not None
-            and not self.is_fp32
-        ):
-            self.core_attention_flash = FlashSelfAttention(
-                causal=True, attention_dropout=config.attn_dropout_prob
-            )
         self.bf16 = config.bf16
         if config.rotary_pct == 1.0:
@@ -453,40 +342,20 @@ class QWenAttention(nn.Module):
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)
-        if (
-            self.use_flash_attn
-            and flash_attn_unpadded_func is not None
-            and not self.is_fp32
-            and query.is_cuda
-        ):
-            q, k, v = query, key, value
-            context_layer = self.core_attention_flash(q, k, v)
-            context_layer = rearrange(
-                context_layer, "b s h d -> b s (h d)"
-            ).contiguous()
-        else:
-            query = query.permute(0, 2, 1, 3)
-            key = key.permute(0, 2, 1, 3)
-            value = value.permute(0, 2, 1, 3)
-            attn_output, attn_weight = self._attn(
-                query, key, value, attention_mask, head_mask
-            )
-            context_layer = self._merge_heads(
-                attn_output, self.num_heads, self.head_dim
-            )
         attn_output = self.c_proj(context_layer)
         outputs = (attn_output, present)
         if output_attentions:
-            if (
-                self.use_flash_attn
-                and flash_attn_unpadded_func is not None
-                and not self.is_fp32
-            ):
-                raise ValueError("Cannot output attentions while using flash-attn")
-            else:
-                outputs += (attn_weight,)
         return outputs
@@ -882,18 +751,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
             elif SUPPORT_FP16:
                 logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
-        if config.use_flash_attn == "auto":
-            if config.bf16 or config.fp16:
-                logger.warn("Try importing flash-attention for faster inference...")
-                config.use_flash_attn = True
-            else:
-                config.use_flash_attn = False
-        if config.use_flash_attn and config.fp32:
-            logger.warn("Flash attention will be disabled because it does NOT support fp32.")
-        if config.use_flash_attn:
-            _import_flash_attn()
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

 apply_rotary_emb_func = None
 rms_norm = None
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 class QWenAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.scale_attn_weights = True
         self.projection_size = config.kv_channels * config.num_attention_heads
         )
         self.is_fp32 = not (config.bf16 or config.fp16)
         self.bf16 = config.bf16
         if config.rotary_pct == 1.0:
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)
+        query = query.permute(0, 2, 1, 3)
+        key = key.permute(0, 2, 1, 3)
+        value = value.permute(0, 2, 1, 3)
+        attn_output, attn_weight = self._attn(
+            query, key, value, attention_mask, head_mask
+        )
+        context_layer = self._merge_heads(
+            attn_output, self.num_heads, self.head_dim
+        )
         attn_output = self.c_proj(context_layer)
         outputs = (attn_output, present)
         if output_attentions:
+            outputs += (attn_weight,)
         return outputs
                 logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
             elif SUPPORT_FP16:
                 logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

qwen.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenization_qwen.py ADDED Viewed

	@@ -0,0 +1,432 @@

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tokenization classes for QWen."""
+import base64
+import logging
+import os
+import requests
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union, Any, Callable, Optional
+import tiktoken
+import numpy as np
+from PIL import Image
+from PIL import ImageFont
+from PIL import ImageDraw
+from transformers import PreTrainedTokenizer, AddedToken
+logger = logging.getLogger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ENDOFTEXT = "<|endoftext|>"
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS
+IMG_TOKEN_SPAN = 256
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    """Read a tiktoken rank file into a ``{token_bytes: rank}`` mapping.
+    Each non-empty line must hold exactly two whitespace-separated fields:
+    the base64-encoded token followed by its integer rank.
+    """
+    with open(tiktoken_bpe_file, "rb") as vocab_fp:
+        raw = vocab_fp.read()
+    ranks = {}
+    for entry in raw.splitlines():
+        if not entry:
+            # blank lines carry no token
+            continue
+        encoded_token, rank_field = entry.split()
+        token = base64.b64decode(encoded_token)
+        ranks[token] = int(rank_field)
+    return ranks
+def _list_find(
+    input_list: List[Any],
+    candidates: Tuple[Any],
+    start: int = 0,
+):
+    """Return the first index at or after ``start`` whose element is in ``candidates``, or -1."""
+    hits = (
+        pos
+        for pos in range(start, len(input_list))
+        if input_list[pos] in candidates
+    )
+    return next(hits, -1)
+def _replace_closed_tag(
+    input_tokens: List[Any],
+    start_tags: Union[Any, Tuple[Any]],
+    end_tags: Union[Any, Tuple[Any]],
+    inclusive_replace_func: Callable,
+    exclusive_replace_func: Callable = lambda x: x,
+):
+    """Rewrite a token sequence span by span around matched start/end tags.
+    The sequence is scanned left to right. Every span running from a start
+    tag to the next occurrence of its paired end tag (both included) is
+    passed to ``inclusive_replace_func``; every stretch outside such spans
+    is passed to ``exclusive_replace_func``. The returned lists are
+    concatenated in order.
+    Args:
+        input_tokens: The sequence to scan (token strings/bytes or token ids).
+        start_tags: One start tag, or a tuple of start tags.
+        end_tags: One end tag, or a tuple paired index-wise with ``start_tags``.
+        inclusive_replace_func: Maps a tagged span to its replacement list.
+        exclusive_replace_func: Maps an untagged stretch to its replacement
+            list. Defaults to the identity.
+    Returns:
+        The rewritten list.
+    Raises:
+        ValueError: If a start tag has no matching end tag after it.
+    """
+    # a bare str/int is a single tag; normalise to one-element tuples
+    if isinstance(start_tags, (str, int)):
+        start_tags = (start_tags,)
+    if isinstance(end_tags, (str, int)):
+        end_tags = (end_tags,)
+    assert len(start_tags) == len(end_tags)
+    output_tokens = []
+    end = 0
+    while True:
+        start = _list_find(input_tokens, start_tags, end)
+        if start == -1:
+            break
+        output_tokens.extend(exclusive_replace_func(input_tokens[end : start]))
+        # only the end tag paired with the start tag that was found closes the span
+        tag_idx = start_tags.index(input_tokens[start])
+        end = _list_find(input_tokens, (end_tags[tag_idx],), start)
+        if end == -1:
+            # name the real tag: this helper also serves ref/box/quad spans
+            raise ValueError(
+                "Unclosed tag {!r}: no matching {!r} found".format(
+                    input_tokens[start], end_tags[tag_idx]
+                )
+            )
+        output_tokens.extend(inclusive_replace_func(input_tokens[start : end + 1]))
+        end += 1
+    output_tokens.extend(exclusive_replace_func(input_tokens[end : ]))
+    return output_tokens
+class QWenTokenizer(PreTrainedTokenizer):
+    """QWen tokenizer built on a tiktoken BPE encoding plus vision-language tags.
+    Regular vocabulary entries are ``bytes`` (the keys of ``mergeable_ranks``);
+    special tokens are ``str`` and receive consecutive ids right after the
+    BPE ranks, in the order ``SPECIAL_TOKENS + self.IMAGE_ST``.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        image_start_tag='<img>',
+        image_end_tag='</img>',
+        image_pad_tag='<imgpad>',
+        ref_start_tag='<ref>',
+        ref_end_tag='</ref>',
+        box_start_tag='<box>',
+        box_end_tag='</box>',
+        quad_start_tag='<quad>',
+        quad_end_tag='</quad>',
+        **kwargs,
+    ):
+        """Load the BPE ranks from ``vocab_file`` and register all special tokens.
+        Args:
+            vocab_file: Path to a tiktoken rank file (base64 token + rank per line).
+            errors: Error policy used when decoding token bytes to text.
+            image_start_tag, image_end_tag, image_pad_tag: Surface forms of the
+                image span delimiters and of the padding token used inside a span.
+            ref_start_tag, ref_end_tag: Surface forms delimiting a reference.
+            box_start_tag, box_end_tag: Surface forms delimiting a box.
+            quad_start_tag, quad_end_tag: Surface forms delimiting a quad.
+            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
+        """
+        super().__init__(**kwargs)
+        self.image_start_tag = image_start_tag
+        self.image_end_tag = image_end_tag
+        self.image_pad_tag = image_pad_tag
+        self.ref_start_tag = ref_start_tag
+        self.ref_end_tag = ref_end_tag
+        self.box_start_tag = box_start_tag
+        self.box_end_tag = box_end_tag
+        self.quad_start_tag = quad_start_tag
+        self.quad_end_tag = quad_end_tag
+        # the order of this tuple fixes the ids of the vision-language tags
+        self.IMAGE_ST = (
+            ref_start_tag, ref_end_tag,
+            box_start_tag, box_end_tag,
+            quad_start_tag, quad_end_tag,
+            image_start_tag, image_end_tag,
+            image_pad_tag
+        )
+        self.errors = errors  # how to handle errors in decoding
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        # special tokens are numbered right after the last BPE rank
+        self.special_tokens = {
+            token: index
+            for index, token in enumerate(
+                SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
+            )
+        }
+        self.img_start_id = self.special_tokens[self.image_start_tag]
+        self.img_end_id = self.special_tokens[self.image_end_tag]
+        self.img_pad_id = self.special_tokens[self.image_pad_tag]
+        self.ref_start_id = self.special_tokens[self.ref_start_tag]
+        self.ref_end_id = self.special_tokens[self.ref_end_tag]
+        self.box_start_id = self.special_tokens[self.box_start_tag]
+        self.box_end_id = self.special_tokens[self.box_end_tag]
+        self.quad_start_id = self.special_tokens[self.quad_start_tag]
+        self.quad_end_id = self.special_tokens[self.quad_end_tag]
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        assert (
+            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+        # id -> surface form, for both regular (bytes) and special (str) tokens
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.tokenizer = enc  # type: tiktoken.Encoding
+        self.eod_id = self.tokenizer.eot_token
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+    def __len__(self) -> int:
+        """Return the vocabulary size reported by the tiktoken encoding."""
+        return self.tokenizer.n_vocab
+    def get_vocab(self) -> Dict[bytes, int]:
+        """Return the regular BPE vocabulary; special tokens are not included."""
+        return self.mergeable_ranks
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> Union[int, List[int]]:
+        """Map one token or a list of tokens to ids.
+        A single ``str``/``bytes`` token yields a single id; any other input is
+        iterated and yields a list. Special tokens take precedence over
+        regular ones. A token found in neither table maps to ``None``.
+        """
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        ids = []
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+        """Validate a request to add tokens; the vocabulary itself is never changed.
+        Returns:
+            Always 0, since no token is actually added.
+        Raises:
+            ValueError: If regular tokens are requested, or if a special token
+                is not one of ``SPECIAL_TOKENS + self.IMAGE_ST``.
+        """
+        if not special_tokens and new_tokens:
+            raise ValueError('Adding regular tokens is not supported')
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in SPECIAL_TOKENS + self.IMAGE_ST:
+                raise ValueError('Adding unknown special tokens is not supported')
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+        The regular BPE ranks are written to ``qwen.tiktoken`` inside
+        ``save_directory``, one ``<base64 token> <rank>`` pair per line.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "qwen.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string into a sequence of tokens.
+        The content of every image span (between the image start and end tags)
+        is re-expressed as one token per byte and padded with the image pad tag
+        to exactly ``IMG_TOKEN_SPAN`` tokens.
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+            kwargs (additional keyword arguments, *optional*):
+                Accepted for interface compatibility; not used by this method.
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        Raises:
+            ValueError: If an image span is longer than ``IMG_TOKEN_SPAN`` bytes
+                or an image start tag is never closed.
+        """
+        tokens = []
+        text = unicodedata.normalize("NFC", text)
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        def _encode_imgurl(img_tokens):
+            assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
+            img_tokens = img_tokens[1:-1]
+            img_url = b''.join(img_tokens)
+            # iterating bytes yields ints, so each byte is looked up by its value
+            out_img_tokens = list(map(self.decoder.get, img_url))
+            if len(out_img_tokens) > IMG_TOKEN_SPAN:
+                raise ValueError("The content in {}..{} is too long".format(
+                    self.image_start_tag, self.image_end_tag))
+            out_img_tokens.extend([self.image_pad_tag] * (IMG_TOKEN_SPAN - len(out_img_tokens)))
+            out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
+            return out_img_tokens
+        return _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens into a single string.
+        Consecutive ``bytes`` tokens are concatenated and decoded as UTF-8
+        together, using ``self.errors`` as the error policy; ``str`` tokens
+        are copied verbatim.
+        Raises:
+            TypeError: If a token is neither ``bytes`` nor ``str``.
+        """
+        pieces = []
+        pending = b""
+        for t in tokens:
+            if isinstance(t, str):
+                # flush buffered bytes first so the output keeps token order
+                if pending:
+                    pieces.append(pending.decode("utf-8", errors=self.errors))
+                    pending = b""
+                pieces.append(t)
+            elif isinstance(t, bytes):
+                pending += t
+            else:
+                raise TypeError("token should only be of type bytes or str")
+        if pending:
+            pieces.append(pending.decode("utf-8", errors=self.errors))
+        return "".join(pieces)
+    @property
+    def vocab_size(self):
+        """Vocabulary size reported by the tiktoken encoding."""
+        return self.tokenizer.n_vocab
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        Not implemented for this tokenizer: ``tokenize`` is overridden directly.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        """Decode token ids to text, restoring the content of image spans.
+        Inside an image span the ids up to the first image pad id are read as
+        raw byte values, decoded as UTF-8 and re-encoded with the regular
+        encoding before the whole sequence is decoded.
+        Args:
+            token_ids: A single id or a list of ids.
+            skip_special_tokens: If true, drop every id not below ``self.eod_id``.
+            errors: Error policy for decoding; falls back to ``self.errors``.
+        Returns:
+            The decoded text.
+        """
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        def _decode_imgurl(img_token_ids):
+            assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
+            img_token_ids = img_token_ids[1:-1]
+            # a span filled to IMG_TOKEN_SPAN carries no pad token at all, so
+            # only cut at the first pad when one is present
+            if self.img_pad_id in img_token_ids:
+                img_token_ids = img_token_ids[ : img_token_ids.index(self.img_pad_id)]
+            img_url = bytes(img_token_ids).decode('utf-8')
+            return [self.img_start_id] + self.tokenizer.encode(img_url) + [self.img_end_id]
+        token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+    def to_list_format(self, text: str):
+        """Split ``text`` into a list of single-key dicts, one per segment.
+        Tagged spans become ``{'image': ...}``, ``{'ref': ...}``, ``{'box': ...}``
+        or ``{'quad': ...}`` holding the content between the tags; every other
+        non-empty stretch becomes ``{'text': ...}``.
+        """
+        text = unicodedata.normalize("NFC", text)
+        token_ids = self.tokenizer.encode(
+            text, allowed_special=set(self.IMAGE_ST + (ENDOFTEXT,)))
+        def _encode_vl_info(tokens):
+            if len(tokens) == 0:
+                return []
+            if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
+                key = 'image'
+            elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
+                key = 'ref'
+            elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
+                key = 'box'
+            elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
+                key = 'quad'
+            else:
+                # plain text may contain special tokens (str), which must be
+                # turned into bytes before joining
+                _tobytes = lambda x: x.encode('utf-8') if isinstance(x, str) else x
+                return [{'text': b''.join(map(_tobytes, map(self.decoder.get, tokens))).decode('utf-8')}]
+            val = b''.join(map(self.decoder.get, tokens[1:-1])).decode('utf-8')
+            return [{key: val}]
+        return _replace_closed_tag(
+            token_ids,
+            (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
+            (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
+            _encode_vl_info,
+            _encode_vl_info,
+        )
+    def from_list_format(self, list_format: List[Dict]):
+        """Build a tagged prompt string from a list of element dicts.
+        Each element is handled by the first matching key: ``'image'``, then
+        ``'text'``, then ``'box'`` (a list of 4-number boxes, optionally
+        preceded by the element's ``'ref'``).
+        Raises:
+            ValueError: If an element has none of the supported keys.
+        """
+        text = ''
+        for ele in list_format:
+            if 'image' in ele:
+                text += self.image_start_tag + ele['image'] + self.image_end_tag
+            elif 'text' in ele:
+                text += ele['text']
+            elif 'box' in ele:
+                if 'ref' in ele:
+                    text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
+                for box in ele['box']:
+                    text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
+            else:
+                raise ValueError("Unsupport element: " + str(ele))
+        return text
+    def _fetch_latest_picture(self, response, history):
+        """Return the content of the most recent image span, or ``None``.
+        ``response`` is searched first, then the first item of each history
+        pair from newest to oldest; the second item of a pair is not searched.
+        """
+        if history is None:
+            history = []
+        _history = history + [(response, None)]
+        for q, r in _history[::-1]:
+            for ele in self.to_list_format(q)[::-1]:
+                if 'image' in ele:
+                    return ele['image']
+        return None
+    def _fetch_all_box_with_ref(self, text):
+        """Collect every box in ``text`` as ``{'box': (x1, y1, x2, y2)}``.
+        A box also gets a ``'ref'`` entry when the nearest preceding element
+        that is not itself a box is a reference.
+        """
+        list_format = self.to_list_format(text)
+        output = []
+        for i, ele in enumerate(list_format):
+            if 'box' in ele:
+                bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
+                assert len(bbox) == 4
+                output.append({'box': bbox})
+                # walk back over sibling boxes so they share one reference
+                ref_idx = i - 1
+                while ref_idx >= 0 and 'box' in list_format[ref_idx]:
+                    ref_idx -= 1
+                if ref_idx >= 0 and 'ref' in list_format[ref_idx]:
+                    output[-1]['ref'] = list_format[ref_idx]['ref'].strip()
+        return output
+    def draw_bbox_on_latest_picture(
+        self,
+        response,
+        history=None,
+    ) -> Optional[Image.Image]:
+        """Draw the boxes found in ``response`` onto the latest referenced image.
+        The image is the one returned by ``_fetch_latest_picture``; it is
+        fetched over HTTP(S) when its location starts with ``http://`` or
+        ``https://`` and opened as a local path otherwise. Box values are
+        divided by 1000 and multiplied by the image width/height.
+        Returns:
+            The annotated RGB image, or ``None`` when no image or no box is found.
+        """
+        image = self._fetch_latest_picture(response, history)
+        if image is None:
+            return None
+        if image.startswith("http://") or image.startswith("https://"):
+            image = Image.open(requests.get(image, stream=True).raw)
+        else:
+            image = Image.open(image)
+        h, w = image.height, image.width
+        image = image.convert("RGB")
+        boxes = self._fetch_all_box_with_ref(response)
+        if not boxes:
+            return None
+        try:
+            fnt = ImageFont.truetype("SimSun.ttf", 50)
+        except OSError:
+            # SimSun is not installed everywhere; still draw the boxes and
+            # label them with PIL's built-in font instead of crashing
+            logger.warning("Font SimSun.ttf not found; falling back to the default PIL font")
+            fnt = ImageFont.load_default()
+        draw = ImageDraw.Draw(image)
+        for box in boxes:
+            x1, y1, x2, y2 = box['box']
+            x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
+            draw.rectangle((x1, y1, x2, y2), outline='red', width=4)
+            if 'ref' in box:
+                draw.text((x1, y1), box['ref'], fill='yellow', font=fnt)
+        return image

visual.py CHANGED Viewed

@@ -1,3 +1,8 @@
 from collections import OrderedDict
 import math
 import requests
@@ -5,11 +10,11 @@ from io import BytesIO
 from functools import partial
 from PIL import Image
 from typing import Callable, Optional, Sequence, Tuple, List
 import torch
 from torch import nn
 from torch.nn import functional as F
-from torch.utils.checkpoint import checkpoint
 from torch.nn.init import trunc_normal_
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
@@ -33,8 +38,64 @@ def get_abs_pos(abs_pos, tgt_size):
     else:
         return abs_pos
 class Resampler(nn.Module):
     def __init__(
             self,
             grid_size,
@@ -48,7 +109,9 @@ class Resampler(nn.Module):
         self.embed_dim = embed_dim
         self.num_heads = num_heads
-        self.pos_embed = nn.Parameter(torch.randn(embed_dim, grid_size)).requires_grad_(False)
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=.02)
@@ -234,7 +297,7 @@ class VisualAttentionBlock(nn.Module):
         return x
-class Transformer(nn.Module):
     def __init__(
             self,
             width: int,
@@ -247,7 +310,6 @@ class Transformer(nn.Module):
         super().__init__()
         self.width = width
         self.layers = layers
-        self.grad_checkpointing = False
         self.resblocks = nn.ModuleList([
             VisualAttentionBlock(
@@ -263,11 +325,7 @@ class Transformer(nn.Module):
     def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         for r in self.resblocks:
-            if self.grad_checkpointing and not torch.jit.is_scripting():
-                # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
-                x = checkpoint(r, x, None, None, attn_mask)
-            else:
-                x = r(x, attn_mask=attn_mask)
         return x
@@ -306,13 +364,13 @@ class VisionTransformer(nn.Module):
         # class embeddings and positional embeddings
         scale = width ** -0.5
-        self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1], width))
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
         act_layer = nn.GELU
         self.ln_pre = norm_layer(width)
-        self.transformer = Transformer(
             width,
             layers,
             heads,
@@ -331,10 +389,6 @@ class VisionTransformer(nn.Module):
         self.ln_post = norm_layer(output_dim)
         self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
-    @torch.jit.ignore
-    def set_grad_checkpointing(self, enable=True):
-        self.transformer.grad_checkpointing = enable
     def forward(self, x: torch.Tensor):
         x = x.to(
             dtype=self.transformer.get_cast_dtype(),
@@ -353,8 +407,7 @@ class VisionTransformer(nn.Module):
         x = self.transformer(x)
         x = x.permute(1, 0, 2)  # LND -> NLD
-        if self.attn_pool:
-            x = self.attn_pool(x)
         x = self.ln_post(x)
         x = x @ self.proj
@@ -365,8 +418,6 @@ class VisionTransformer(nn.Module):
         for image_path in image_paths:
             if image_path.startswith("http://") or image_path.startswith("https://"):
                 image = Image.open(requests.get(image_path, stream=True).raw)
-            elif image_path.startswith("oss://"):
-                raise NotImplementedError
             else:
                 image = Image.open(image_path)
             image = image.convert("RGB")

+# Copyright (c) Alibaba Cloud.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
 from collections import OrderedDict
 import math
 import requests
 from functools import partial
 from PIL import Image
 from typing import Callable, Optional, Sequence, Tuple, List
+import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
 from torch.nn.init import trunc_normal_
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
     else:
         return abs_pos
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
+    """
+    Build a fixed 2D sine-cosine positional embedding for a square grid.
+    grid_size: int of the grid height and width
+    cls_token: if true, prepend one all-zero row
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/o or w/ cls_token)
+    """
+    coords = np.arange(grid_size, dtype=np.float32)
+    # height and width share the same coordinates; meshgrid puts w first
+    mesh = np.stack(np.meshgrid(coords, coords), axis=0)
+    mesh = mesh.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
+    if not cls_token:
+        return pos_embed
+    return np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode the two planes of ``grid`` with embed_dim/2 channels each.
+    grid: indexable with 0 and 1, one coordinate array per index
+    out: (H*W, D), the encoding of grid[0] followed by that of grid[1]
+    """
+    assert embed_dim % 2 == 0
+    # each plane gets half of the channels
+    halves = [
+        get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[plane])  # (H*W, D/2)
+        for plane in (0, 1)
+    ]
+    return np.concatenate(halves, axis=1)  # (H*W, D)
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sine-cosine encoding of a set of scalar positions.
+    embed_dim: output dimension for each position (must be even)
+    pos: an array of positions to be encoded; it is flattened to size (M,)
+    out: (M, D), D/2 sine channels followed by D/2 cosine channels
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+    freqs = np.arange(half_dim, dtype=np.float32)
+    freqs /= embed_dim / 2.
+    freqs = 1. / 10000**freqs  # (D/2,)
+    flat_pos = pos.reshape(-1)  # (M,)
+    # outer product via broadcasting: one angle per (position, frequency)
+    angles = flat_pos[:, None] * freqs[None, :]  # (M, D/2)
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
 class Resampler(nn.Module):
+    """
+    A 2D perceiver-resampler network with one cross attention layers by
+        (grid_size**2) learnable queries and 2d sincos pos_emb
+    Outputs:
+        A tensor with the shape of (grid_size**2, embed_dim)
+    """
     def __init__(
             self,
             grid_size,
         self.embed_dim = embed_dim
         self.num_heads = num_heads
+        self.pos_embed = nn.Parameter(
+            torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
+        ).requires_grad_(False)
         self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         trunc_normal_(self.query, std=.02)
         return x
+class TransformerBlock(nn.Module):
     def __init__(
             self,
             width: int,
         super().__init__()
         self.width = width
         self.layers = layers
         self.resblocks = nn.ModuleList([
             VisualAttentionBlock(
     def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
         for r in self.resblocks:
+            x = r(x, attn_mask=attn_mask)
         return x
         # class embeddings and positional embeddings
         scale = width ** -0.5
+        self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
         act_layer = nn.GELU
         self.ln_pre = norm_layer(width)
+        self.transformer = TransformerBlock(
             width,
             layers,
             heads,
         self.ln_post = norm_layer(output_dim)
         self.proj = nn.Parameter((output_dim** -0.5) * torch.randn(output_dim, output_dim))
     def forward(self, x: torch.Tensor):
         x = x.to(
             dtype=self.transformer.get_cast_dtype(),
         x = self.transformer(x)
         x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.attn_pool(x)
         x = self.ln_post(x)
         x = x @ self.proj
         for image_path in image_paths:
             if image_path.startswith("http://") or image_path.startswith("https://"):
                 image = Image.open(requests.get(image_path, stream=True).raw)
             else:
                 image = Image.open(image_path)
             image = image.convert("RGB")