VictorSanh committed on
Commit
5ea341f
·
1 Parent(s): f653a92

bunch of updates

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<fake_token_around_image>": 32000,
3
+ "<image>": 32001
4
+ }
modeling_img2html.py CHANGED
@@ -109,7 +109,7 @@ class Img2HTMLBaseModelOutputWithPast(ModelOutput):
109
  @dataclass
110
  class Img2HTMLCausalLMOutputWithPast(ModelOutput):
111
  """
112
- Base class for Idefics causal language model (or autoregressive) outputs.
113
 
114
  Args:
115
  loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
@@ -171,10 +171,10 @@ def expand_inputs_for_generation(
171
  if attention_mask is not None:
172
  model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
173
 
174
- if model_kwargs["image_attention_mask"] is not None:
175
- model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
176
- 0, expanded_return_idx
177
- )
178
 
179
  if model_kwargs["pixel_values"] is not None:
180
  model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
 
109
  @dataclass
110
  class Img2HTMLCausalLMOutputWithPast(ModelOutput):
111
  """
112
+ Base class for Img2HTML causal language model (or autoregressive) outputs.
113
 
114
  Args:
115
  loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
 
171
  if attention_mask is not None:
172
  model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
173
 
174
+ # if model_kwargs["image_attention_mask"] is not None:
175
+ # model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
176
+ # 0, expanded_return_idx
177
+ # )
178
 
179
  if model_kwargs["pixel_values"] is not None:
180
  model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
preprocessor_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "auto_map": {
3
- "AutoProcessor": "processing_img2html.Img2HTMLProcessor",
4
- "AutoImageProcessor": "image_processing_img2html.Img2HTMLImageProcessor"
5
  },
6
  "image_num_channels": 3,
7
  "image_mean": [
 
1
  {
2
  "auto_map": {
3
+ "AutoProcessor": "HuggingFaceM4/img2html--processing_img2html.Img2HTMLProcessor",
4
+ "AutoImageProcessor": "HuggingFaceM4/img2html--image_processing_img2html.Img2HTMLImageProcessor"
5
  },
6
  "image_num_channels": 3,
7
  "image_mean": [
processing_img2html.py CHANGED
@@ -24,6 +24,7 @@ from transformers.processing_utils import ProcessorMixin
24
  from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
25
  from transformers.utils import TensorType, is_torch_available
26
 
 
27
 
28
  if is_torch_available():
29
  import torch
@@ -40,7 +41,6 @@ def is_url(string):
40
  result = urlparse(string)
41
  return all([result.scheme, result.netloc])
42
 
43
-
44
  class Img2HTMLProcessor(ProcessorMixin):
45
  r"""
46
  Constructs an Img2HTML processor which wraps a LLaMA tokenizer and an Img2HTML image processor into a single processor.
@@ -60,7 +60,7 @@ class Img2HTMLProcessor(ProcessorMixin):
60
  image_processor_class = "Img2HTMLImageProcessor"
61
  tokenizer_class = "LlamaTokenizerFast"
62
 
63
- def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
64
  if image_processor is None:
65
  raise ValueError("You need to specify an `image_processor`.")
66
  if tokenizer is None:
@@ -76,11 +76,32 @@ class Img2HTMLProcessor(ProcessorMixin):
76
  self.image_processor.image_size,
77
  )
78
 
79
- self.tokenizer_was_trained_with_end_of_utterance_token = (
80
- True
81
- if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
82
- else False
83
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  def __call__(
86
  self,
@@ -90,7 +111,6 @@ class Img2HTMLProcessor(ProcessorMixin):
90
  max_length: Optional[int] = None,
91
  transform: Callable = None,
92
  add_eos_token=False,
93
- add_end_of_utterance_token=None,
94
  debug=False,
95
  return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
96
  ) -> BatchEncoding:
@@ -120,10 +140,6 @@ class Img2HTMLProcessor(ProcessorMixin):
120
  set of transforms will be applied to the images
121
  add_eos_token (`bool`, *optional*, defaults to `False`):
122
  Adds `eos_token` at the end of the final prompt if `True`
123
- add_end_of_utterance_token (`bool`, *optional*)
124
- Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
125
- image). If `None` the tokenizer will be checked instead and if this token is found in
126
- `additional_special_tokens` then the value will be `True`.
127
  debug (`bool`, *optional*, defaults to `False`):
128
  `True` value will help debug prompt generation by dumping useful information
129
  return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
@@ -198,18 +214,12 @@ class Img2HTMLProcessor(ProcessorMixin):
198
  In order to help debug prompt generation enable `debug=True` which will show you what's happening.
199
 
200
  """
201
-
202
- # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
203
- if add_end_of_utterance_token is None:
204
- add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
205
-
206
  # turn non-batched prompts into batched
207
  if not any(isinstance(i, list) for i in prompts):
208
  prompts = [prompts]
209
 
210
  fake_token = "<fake_token_around_image>"
211
  image_token = "<image>"
212
- end_of_utterance_token = "<end_of_utterance>"
213
 
214
  def image_tokens(last_was_image):
215
  if last_was_image:
@@ -239,9 +249,6 @@ class Img2HTMLProcessor(ProcessorMixin):
239
  image_objects.append(image)
240
  last_was_image = True
241
  else:
242
- # we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
243
- if add_end_of_utterance_token and last_was_text:
244
- full_text += end_of_utterance_token
245
  full_text += item
246
  last_was_image = False
247
  else:
 
24
  from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
25
  from transformers.utils import TensorType, is_torch_available
26
 
27
+ from .image_processing_img2html import Img2HTMLImageProcessor
28
 
29
  if is_torch_available():
30
  import torch
 
41
  result = urlparse(string)
42
  return all([result.scheme, result.netloc])
43
 
 
44
  class Img2HTMLProcessor(ProcessorMixin):
45
  r"""
46
  Constructs an Img2HTML processor which wraps a LLaMA tokenizer and an Img2HTML image processor into a single processor.
 
60
  image_processor_class = "Img2HTMLImageProcessor"
61
  tokenizer_class = "LlamaTokenizerFast"
62
 
63
+ def __init__(self, image_processor, tokenizer=None, image_size=960, **kwargs):
64
  if image_processor is None:
65
  raise ValueError("You need to specify an `image_processor`.")
66
  if tokenizer is None:
 
76
  self.image_processor.image_size,
77
  )
78
 
79
+ # @classmethod
80
+ # def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
81
+ # # Hack overriding things
82
+ # from pathlib import Path
83
+ # from transformers.utils import direct_transformers_import
84
+ # # Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
85
+ # transformers_module = direct_transformers_import(Path(__file__).parent)
86
+
87
+ # args = []
88
+ # for attribute_name in cls.attributes:
89
+ # class_name = getattr(cls, f"{attribute_name}_class")
90
+ # if isinstance(class_name, tuple):
91
+ # classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
92
+ # use_fast = kwargs.get("use_fast", True)
93
+ # if use_fast and classes[1] is not None:
94
+ # attribute_class = classes[1]
95
+ # else:
96
+ # attribute_class = classes[0]
97
+ # else:
98
+ # if class_name == "Img2HTMLImageProcessor":
99
+ # attribute_class = Img2HTMLImageProcessor
100
+ # else:
101
+ # attribute_class = getattr(transformers_module, class_name)
102
+
103
+ # args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
104
+ # return args
105
 
106
  def __call__(
107
  self,
 
111
  max_length: Optional[int] = None,
112
  transform: Callable = None,
113
  add_eos_token=False,
 
114
  debug=False,
115
  return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
116
  ) -> BatchEncoding:
 
140
  set of transforms will be applied to the images
141
  add_eos_token (`bool`, *optional*, defaults to `False`):
142
  Adds `eos_token` at the end of the final prompt if `True`
 
 
 
 
143
  debug (`bool`, *optional*, defaults to `False`):
144
  `True` value will help debug prompt generation by dumping useful information
145
  return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
 
214
  In order to help debug prompt generation enable `debug=True` which will show you what's happening.
215
 
216
  """
 
 
 
 
 
217
  # turn non-batched prompts into batched
218
  if not any(isinstance(i, list) for i in prompts):
219
  prompts = [prompts]
220
 
221
  fake_token = "<fake_token_around_image>"
222
  image_token = "<image>"
 
223
 
224
  def image_tokens(last_was_image):
225
  if last_was_image:
 
249
  image_objects.append(image)
250
  last_was_image = True
251
  else:
 
 
 
252
  full_text += item
253
  last_was_image = False
254
  else:
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "32000": {
28
+ "content": "<fake_token_around_image>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "32001": {
36
+ "content": "<image>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "additional_special_tokens": [],
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "eos_token": "</s>",
48
+ "legacy": false,
49
+ "model_max_length": 1000000000000000019884624838656,
50
+ "pad_token": "<unk>",
51
+ "sp_model_kwargs": {},
52
+ "spaces_between_special_tokens": false,
53
+ "tokenizer_class": "LlamaTokenizer",
54
+ "unk_token": "<unk>",
55
+ "use_default_system_prompt": true
56
+ }