ClementRomac HF staff committed on
Commit
79b1536
·
1 Parent(s): 2ffbe07

Upload processor

Browse files
Files changed (1) hide show
  1. processor.py +7 -8
processor.py CHANGED
@@ -3,27 +3,26 @@ from transformers import GitProcessor
3
 
4
 
5
  class GIAProcessor(GitProcessor):
6
- def __init__(self, image_processor, tokenizer, max_input_size):
7
  super().__init__(image_processor, tokenizer)
8
- self.max_input_size = max_input_size
9
 
10
- def _cut_text(self, examples):
11
  results = {
12
  "input_ids": [],
13
  "attention_mask": []
14
  }
15
  for i in range(len(examples["input_ids"])):
16
  _input_size = len(examples["input_ids"][i])
17
- for j in range(max(1, _input_size // self.max_input_size)):
18
- results["input_ids"].append(examples["input_ids"][i][j*self.max_input_size:(j + 1) * self.max_input_size])
19
- results["attention_mask"].append(examples["attention_mask"][i][j * self.max_input_size:(j + 1) * self.max_input_size])
20
 
21
  return results
22
 
23
- def __call__(self, examples, return_tensors=None, **kwargs):
24
  if "text" in examples and not "images" in examples:
25
  encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
26
- encoding = self._cut_text(encoded_text)
27
  elif "text" in examples and "images" in examples:
28
  encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
29
 
 
3
 
4
 
5
  class GIAProcessor(GitProcessor):
6
+ def __init__(self, image_processor, tokenizer):
7
  super().__init__(image_processor, tokenizer)
 
8
 
9
+ def _cut_text(self, examples, max_input_size):
10
  results = {
11
  "input_ids": [],
12
  "attention_mask": []
13
  }
14
  for i in range(len(examples["input_ids"])):
15
  _input_size = len(examples["input_ids"][i])
16
+ for j in range(max(1, _input_size // max_input_size)):
17
+ results["input_ids"].append(examples["input_ids"][i][j*max_input_size:(j + 1) * max_input_size])
18
+ results["attention_mask"].append(examples["attention_mask"][i][j * max_input_size:(j + 1) * max_input_size])
19
 
20
  return results
21
 
22
+ def __call__(self, examples, max_input_size, return_tensors=None, **kwargs):
23
  if "text" in examples and not "images" in examples:
24
  encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
25
+ encoding = self._cut_text(encoded_text, max_input_size)
26
  elif "text" in examples and "images" in examples:
27
  encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
28