test3

by amirsala7 - opened Aug 30, 2023

←

Files changed (7) hide show

.gitattributes CHANGED Viewed

@@ -34,5 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 snowman.png filter=lfs diff=lfs merge=lfs -text
-pikachu.png filter=lfs diff=lfs merge=lfs -text
-pikachu_bbox.png filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 snowman.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -5,12 +5,6 @@
 ---
 # Kosmos-2: Grounding Multimodal Large Language Models to the World
-**This model (remote code on the Hub) is deprecated. Please use https://huggingface.co/microsoft/kosmos-2-patch14-224**
-**There are some changes in terms of input formats: see the model card in https://huggingface.co/microsoft/kosmos-2-patch14-224**
-~~**(There is an on going effort to port `Kosmos-2` directly into `transformers`. This repository (remote code) might need some more bug fixes later, including breaking changes.)**~~
 <a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><figure><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="384"><figcaption><b>[An image of a snowman warming himself by a fire.]</b></figcaption></figure></a>

 ---
 # Kosmos-2: Grounding Multimodal Large Language Models to the World
 <a href="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" target="_blank"><figure><img src="https://huggingface.co/ydshieh/kosmos-2-patch14-224/resolve/main/annotated_snowman.jpg" width="384"><figcaption><b>[An image of a snowman warming himself by a fire.]</b></figcaption></figure></a>

modeling_kosmos2.py CHANGED Viewed

@@ -22,7 +22,6 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import (
@@ -1008,7 +1007,7 @@ class Kosmos2TextTransformer(nn.Module):
  inputs_embeds = self.embed_tokens(input_ids)
  if img_features is not None:
- inputs_embeds[img_input_mask.to(dtype=torch.bool)] = img_features.view(-1, img_features.size(-1))
  inputs_embeds = inputs_embeds * self.embed_scale

 import torch
 import torch.utils.checkpoint
 from torch import nn
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import (
  inputs_embeds = self.embed_tokens(input_ids)
  if img_features is not None:
+ inputs_embeds[img_input_mask.to(dtype=torch.bool)] = img_features
  inputs_embeds = inputs_embeds * self.embed_scale

pikachu.png DELETED Viewed

pikachu.webp DELETED Viewed

Binary file (35.4 kB)

pikachu_bbox.png DELETED Viewed

tokenization_kosmos2_fast.py CHANGED Viewed

@@ -137,6 +137,7 @@ class Kosmos2TokenizerFast(PreTrainedTokenizerFast):
  )
  self.vocab_file = vocab_file
  self.eod_token = "</doc>"
@@ -178,10 +179,6 @@ class Kosmos2TokenizerFast(PreTrainedTokenizerFast):
  # we need to set `special_tokens=False` to be the same as in the slow tokenizer.
  self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=False)
- @property
- def can_save_slow_tokenizer(self) -> bool:
- return os.path.isfile(self.vocab_file) if self.vocab_file else False
  def build_inputs_with_special_tokens(
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  ) -> List[int]:

  )
  self.vocab_file = vocab_file
+ self.can_save_slow_tokenizer = False if not self.vocab_file else True
  self.eod_token = "</doc>"
  # we need to set `special_tokens=False` to be the same as in the slow tokenizer.
  self.add_tokens(AddedToken(token, lstrip=True, rstrip=False), special_tokens=False)
  def build_inputs_with_special_tokens(
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  ) -> List[int]: