Commit 79b1536
Parent(s): 2ffbe07
Upload processor

processor.py CHANGED (+7 -8)
@@ -3,27 +3,26 @@ from transformers import GitProcessor
 
 
 class GIAProcessor(GitProcessor):
-    def __init__(self, image_processor, tokenizer, max_input_size):
+    def __init__(self, image_processor, tokenizer):
         super().__init__(image_processor, tokenizer)
-        self.max_input_size = max_input_size
 
-    def _cut_text(self, examples):
+    def _cut_text(self, examples, max_input_size):
         results = {
             "input_ids": [],
             "attention_mask": []
         }
         for i in range(len(examples["input_ids"])):
             _input_size = len(examples["input_ids"][i])
-            for j in range(max(1, _input_size // self.max_input_size)):
-                results["input_ids"].append(examples["input_ids"][i][j*self.max_input_size:(j + 1) * self.max_input_size])
-                results["attention_mask"].append(examples["attention_mask"][i][j * self.max_input_size:(j + 1) * self.max_input_size])
+            for j in range(max(1, _input_size // max_input_size)):
+                results["input_ids"].append(examples["input_ids"][i][j*max_input_size:(j + 1) * max_input_size])
+                results["attention_mask"].append(examples["attention_mask"][i][j * max_input_size:(j + 1) * max_input_size])
 
         return results
 
-    def __call__(self, examples, return_tensors=None, **kwargs):
+    def __call__(self, examples, max_input_size, return_tensors=None, **kwargs):
         if "text" in examples and not "images" in examples:
             encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
-            encoding = self._cut_text(encoded_text)
+            encoding = self._cut_text(encoded_text, max_input_size)
         elif "text" in examples and "images" in examples:
             encoding = super().__call__(examples["text"], examples["images"], return_tensors, **kwargs)
 
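In effect, the commit moves max_input_size out of the constructor and into a per-call argument of _cut_text and __call__. As a minimal usage sketch of the updated processor on a text-only batch (the microsoft/git-base checkpoint, the batch contents, and the window size are assumptions for illustration, not part of the commit):

from transformers import AutoImageProcessor, AutoTokenizer

# Assumed checkpoint; any GIT-compatible image processor/tokenizer pair works here.
image_processor = AutoImageProcessor.from_pretrained("microsoft/git-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/git-base")
processor = GIAProcessor(image_processor, tokenizer)

# Text-only path: the tokenizer encodes the batch, then _cut_text splits each
# encoded sequence into windows of at most max_input_size tokens.
batch = {"text": ["a very long document ..."]}
encoding = processor(batch, max_input_size=512)

print(len(encoding["input_ids"]))     # number of windows produced
print(len(encoding["input_ids"][0]))  # at most 512 tokens in each window

Note that the floor division in _cut_text drops any trailing remainder shorter than a full window when a sequence is longer than max_input_size, while a sequence shorter than max_input_size passes through as a single window.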