ClementRomac HF staff committed on
Commit
2e8ba46
·
1 Parent(s): e7c16cf

Upload processor

Browse files
preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "crop_size": {
3
  "height": 224,
4
  "width": 224
 
1
  {
2
+ "auto_map": {
3
+ "AutoProcessor": "processor.GIAProcessor"
4
+ },
5
  "crop_size": {
6
  "height": 224,
7
  "width": 224
processor.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import chain
2
+ from transformers import GitProcessor
3
+
4
class GIAProcessor(GitProcessor):
    """GIT processor that packs text-only inputs into fixed-size token blocks.

    Text-only calls are tokenized, concatenated and split into blocks of
    ``self._block_size`` tokens. Every other call is delegated unchanged to
    ``GitProcessor.__call__``.
    """

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)
        # Number of tokens in each packed block produced for text-only inputs.
        self._block_size = 1024

    def _group_texts(self, examples):
        """Concatenate tokenized sequences and split them into equal blocks.

        Args:
            examples: Mapping from feature name to a list of per-example
                sequences (each value must be an iterable of iterables).

        Returns:
            A dict with the same keys, where each value is a list of chunks
            of exactly ``self._block_size`` items. The trailing remainder is
            dropped, so every value is an empty list when fewer than
            ``self._block_size`` items are available in total. An empty
            mapping yields an empty dict.
        """
        # Flatten every feature into one long sequence.
        concatenated = {
            key: list(chain.from_iterable(sequences))
            for key, sequences in examples.items()
        }
        if not concatenated:
            return {}

        # The length of the first feature drives the chunking of all features,
        # as in the original implementation.
        total_length = len(next(iter(concatenated.values())))
        # Drop the remainder that does not fill a whole block. Padding could be
        # used instead if the model supported it.
        usable_length = (total_length // self._block_size) * self._block_size

        return {
            key: [
                tokens[start:start + self._block_size]
                for start in range(0, usable_length, self._block_size)
            ]
            for key, tokens in concatenated.items()
        }

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """Encode text and/or images.

        Args:
            text: A string or a list of strings, or ``None``.
            images: Image input forwarded to the parent processor, or ``None``.
            return_tensors: Forwarded to the tokenizer (text-only path) or to
                the parent processor (all other paths).
            **kwargs: Forwarded to the parent processor only.

        Returns:
            For text-only input, the dict produced by ``_group_texts``.
            Otherwise whatever ``GitProcessor.__call__`` returns.
        """
        if text is not None and images is None:
            # A bare string tokenizes to flat lists of ints, which cannot be
            # concatenated per example; wrap it so the output is always batched.
            if isinstance(text, str):
                text = [text]
            # NOTE(review): ``kwargs`` are not forwarded to the tokenizer here,
            # and grouping assumes list outputs -- confirm the intended
            # behavior when ``return_tensors`` is set.
            encoded_text = self.tokenizer(text, return_tensors=return_tensors)
            return self._group_texts(encoded_text)

        # Images-only, text+images and the "nothing given" case all go to the
        # parent. Previously the images-only / nothing-given cases fell through
        # and raised UnboundLocalError. Keyword arguments are used so the call
        # does not depend on the parent's positional parameter order.
        return super().__call__(
            text=text, images=images, return_tensors=return_tensors, **kwargs
        )

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Names of the inputs this processor can produce for the model."""
        return ["input_ids", "attention_mask", "pixel_values"]


# Make the class discoverable through ``AutoProcessor`` (matches the
# ``auto_map`` entries added to the JSON configs in this commit).
GIAProcessor.register_for_auto_class("AutoProcessor")
tokenizer_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "clean_up_tokenization_spaces": true,
3
  "cls_token": "[CLS]",
4
  "do_lower_case": true,
 
1
  {
2
+ "auto_map": {
3
+ "AutoProcessor": "processor.GIAProcessor"
4
+ },
5
  "clean_up_tokenization_spaces": true,
6
  "cls_token": "[CLS]",
7
  "do_lower_case": true,