abdiharyadi committed
Commit 58abf68
1 parent: f420881

feat: add actual application

Files changed (4)
  1. README.md +5 -5
  2. app.py +109 -3
  3. requirements.txt +4 -0
  4. tokenizers.py +600 -0
README.md CHANGED
@@ -1,14 +1,14 @@
 ---
-title: Kancilgpt
-emoji: 📚
-colorFrom: yellow
-colorTo: purple
+title: KancilGPT
+emoji: 🦌
+colorFrom: orange
+colorTo: white
 sdk: gradio
 sdk_version: 5.4.0
 app_file: app.py
 pinned: false
 license: mit
-short_description: (WIP cuz learning it rn) Indonesian Fable Story Generator
+short_description: Indonesian Fable Story Generator
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,7 +1,113 @@
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+from transformers import GPT2LMHeadModel
+from tokenizers import IndoNLGTokenizer
+
+gpt_tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indogpt")
+gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
+kancilgpt = GPT2LMHeadModel.from_pretrained("abdiharyadi/kancilgpt")
+
+def generate_story(judul: str):
+    yield "...", "..."
+
+    stop = False
+    while not stop:
+        gpt_input = gpt_tokenizer('<s> awal cerita | judul:', return_tensors='pt')
+        gpt_out = kancilgpt.generate(**gpt_input, do_sample=True, max_length=512, pad_token_id=gpt_tokenizer.eos_token_id)
+        result = gpt_tokenizer.decode(gpt_out[0])
+        _, judul_prompt, isi, *end_part = result.split(" | ")
+        end_part = "".join(end_part)
+        _, *judul_words = judul_prompt.split()
+        judul = " ".join(judul_words)
+
+
+        if "</s>" in judul or "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
+            print("Invalid output! Regenerating ....")
+            continue
+
+
+        quote_count = 0
+        for c in isi:
+            if c == "\"":
+                quote_count += 1
+
+        if quote_count % 2 != 0:
+            print("Invalid output! Regenerating ....")
+            continue
+
+        stop = True
+
+    yield judul, isi + " ..."
+
+    total_isi = isi
+
+    while not end_part.startswith("tamat"):
+        yield judul, total_isi + " ..."
+
+        i = 0
+        in_quote = False
+        end_sentence = False
+        limit = 1750
+        while i < len(isi) and not (end_sentence and (not in_quote) and isi[i] == " " and (len(isi) - i) < limit):
+            if isi[i] == "\"":
+                in_quote = not in_quote
+
+            if end_sentence:
+                end_sentence = isi[i] not in "abcdefghijklmnopqrstuvwxyz"
+            else:
+                end_sentence = isi[i] in ".?!"
+
+            i += 1
+        # i == len(isi) or end_sentence or (not in_quote) or isi[i] == " "
+
+        while i < len(isi) and not (isi[i] in "abcdefghijklmnopqrstuvwxyz\""):
+            i += 1
+        # i == len(isi) or isi[i] in "abcdefghijklmnopqrstuvwxyz\""
+
+        if i == len(isi):
+            raise ValueError("What???")
+
+        next_isi = isi[i:]
+
+        stop = False
+        while not stop:
+            gpt_input = gpt_tokenizer(f'<s> pertengahan cerita | judul: {judul} | {next_isi}', return_tensors='pt')
+            gpt_out = kancilgpt.generate(**gpt_input, do_sample=True, max_length=512, pad_token_id=gpt_tokenizer.eos_token_id)
+            result = gpt_tokenizer.decode(gpt_out[0])
+
+            _, judul_prompt, isi, *end_part = result.split(" | ")
+            end_part = "".join(end_part)
+            _, *judul_words = judul_prompt.split()
+            judul = " ".join(judul_words)
+
+            if isi[len(next_isi) + 1:].strip() != "":
+                print(isi[len(next_isi) + 1:])
+
+            if "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
+                print("Invalid output! Regenerating ....")
+                continue
+
+            quote_count = 0
+            for c in isi:
+                if c == "\"":
+                    quote_count += 1
+
+            if quote_count % 2 != 0:
+                print("Invalid output! Regenerating ....")
+                continue
+
+            stop = True
+
+        total_isi += " " + isi[len(next_isi) + 1:]
+
+    yield judul, total_isi + "\n\ntamat."
+
+demo = gr.Interface(
+    fn=generate_story,
+    inputs=None,
+    outputs=[
+        gr.Textbox(label="judul", lines=1),
+        gr.Textbox(label="cerita", lines=7)
+    ]
+)
+
 demo.launch()
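The generation loop above relies on KancilGPT emitting text in a fixed `<s> <segmen> | judul: <judul> | <isi> | <status>` layout and simply regenerates whenever the decoded string breaks that contract (a stray `</s>` or `|`, an unknown status, or an odd number of quotation marks). Below is a minimal, self-contained sketch of that parse-and-check step; the helper name `parse_and_validate` and the sample string are illustrative only, not part of this commit.

```python
def parse_and_validate(result: str):
    """Split a decoded output into (judul, isi, end_part) and apply the same
    sanity checks app.py performs before accepting a generation."""
    _, judul_prompt, isi, *end_part = result.split(" | ")
    end_part = "".join(end_part)
    _, *judul_words = judul_prompt.split()  # drop the leading "judul:" marker
    judul = " ".join(judul_words)

    valid = (
        "</s>" not in judul
        and "</s>" not in isi
        and "|" not in isi
        and any(end_part.startswith(x) for x in ["bersambung", "tamat"])
        and isi.count('"') % 2 == 0  # quotation marks must be balanced
    )
    return judul, isi, end_part, valid


# Made-up example of the expected output shape:
sample = '<s> awal cerita | judul: kancil dan buaya | suatu hari kancil ingin menyeberangi sungai. | bersambung'
print(parse_and_validate(sample))
```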
requirements.txt ADDED
@@ -0,0 +1,4 @@
+transformers
+sentencepiece
+datasets
+torch
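Note that gradio itself is not pinned here: a Space declared with `sdk: gradio` installs it from the `sdk_version` field in the README front matter, so requirements.txt only needs the model-side dependencies. For a local run, a quick import check along these lines (an illustrative snippet, not part of the commit) confirms the environment:

```python
# Illustrative sanity check: verify the Space's dependencies import cleanly.
import importlib

for name in ["transformers", "sentencepiece", "datasets", "torch", "gradio"]:
    module = importlib.import_module(name)
    print(f"{name}: {getattr(module, '__version__', 'unknown version')}")
```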
tokenizers.py ADDED
@@ -0,0 +1,600 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+""" Tokenization classes for IndoNLG model."""
+
+from typing import Dict, List, Optional, Tuple, Union
+from transformers import PreTrainedTokenizer, BatchEncoding
+
+from collections.abc import Mapping
+from transformers.utils import (
+    PaddingStrategy,
+    TensorType,
+    is_tf_available,
+    is_torch_available,
+    logging,
+    to_py_obj,
+)
+import numpy as np
+import sentencepiece as spm
+from transformers.utils.generic import _is_tensorflow, _is_torch
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "indobenchmark/indobart": "https://huggingface.co/indobenchmark/indobart/resolve/main/sentencepiece.bpe.model",
+        "indobenchmark/indogpt": "https://huggingface.co/indobenchmark/indogpt/resolve/main/sentencepiece.bpe.model",
+        "indobenchmark/indobart-v2": "https://huggingface.co/indobenchmark/indobart-v2/resolve/main/sentencepiece.bpe.model"
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "indobenchmark/indobart": 768,
+    "indobenchmark/indogpt": 768,
+    "indobenchmark/indobart-v2": 768
+}
+
+SHARED_MODEL_IDENTIFIERS = [
+    # Load with
+    "indobenchmark/indobart",
+    "indobenchmark/indogpt",
+    "indobenchmark/indobart-v2"
+]
+
+SPIECE_UNDERLINE = "▁"
+
+# Define type aliases and NamedTuples
+TextInput = str
+PreTokenizedInput = List[str]
+EncodedInput = List[int]
+TextInputPair = Tuple[str, str]
+PreTokenizedInputPair = Tuple[List[str], List[str]]
+EncodedInputPair = Tuple[List[int], List[int]]
+
+class IndoNLGTokenizer(PreTrainedTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels']
+    input_error_message = "text input must of type `str` (single example), `List[str]` (batch of examples)."
+
+    def __init__(
+        self,
+        vocab_file,
+        decode_special_token=True,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        additional_special_tokens=[],
+        **kwargs
+    ):
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+        self.decode_special_token = decode_special_token
+        self.model_max_length = 1024
+
+        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
+        # sentencepiece vocabulary (this is the case for <s> and </s>
+        self.special_tokens_to_ids = {
+            "[javanese]": 40000,
+            "[sundanese]": 40001,
+            "[indonesian]": 40002,
+            "<mask>": 40003
+        }
+        self.special_ids_to_tokens = {v: k for k, v in self.special_tokens_to_ids.items()}
+
+        # Giving a warning when exists additional_special_tokens outside of dedicated special tokens.
+        for token in additional_special_tokens:
+            if token not in self.special_tokens_to_ids:
+                print(f"Warning: Additional special tokens will be ignored in IndoNLGTokenizer.")
+                break
+
+        # Store Language token ID
+        self.javanese_token = '[javanese]'
+        self.javanese_token_id = 40000
+        self.sundanese_token = '[sundanese]'
+        self.sundanese_token_id = 40001
+        self.indonesian_token = '[indonesian]'
+        self.indonesian_token_id = 40002
+
+        super().__init__(
+            vocab_file=vocab_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+        self.special_token_ids = [
+            self.bos_token_id, self.eos_token_id, self.sep_token_id, self.cls_token_id,
+            self.unk_token_id, self.pad_token_id, self.mask_token_id,
+            self.javanese_token_id, self.sundanese_token_id, self.indonesian_token_id
+        ]
+
+    def prepare_input_for_generation(self, inputs, model_type='indobart', lang_token='[indonesian]', decoder_inputs=None,
+                                     decoder_lang_token='[indonesian]', padding='longest', return_tensors=None):
+        """
+        Build model inputs for a specified `model_type`. There are two possible `model_type`, i.e., indobart and indogpt.
+
+        When `model_type` is indogpt, `lang_token`, `decoder_inputs`, and `decoder_lang_token` parameters will be ignored
+        and the input will be encoded in the gpt2 sequence format as follow:
+
+        - indogpt sequence: ``<s> X``
+
+        When `model_type` is indobart, `inputs` and `lang_token` are used as the sequence and language identifier for the indobart encoder,
+        while `decoder_inputs` and `decoder_lang_token` are used as the sequence and language identifier of the decoder
+
+        - indobart encoder sequence: ``X </s> <lang_token_id>``
+        - indobart decoder sequences: ``<decoder_lang_token_id> X </s>``
+
+        Args:
+            inputs (:obj:`str` or `List[str]`):
+                text sequence or list of text sequences to be tokenized.
+            model_type (:obj:`str`, defaults to :obj:`indobart`):
+                model type to determine the format of the tokenized sequence. Valid values are `indobart` and `indogpt`.
+            lang_token (:obj:`str`, defaults to :obj:`[indonesian]`):
+                language token to determine the format of the tokenized sequence. Valid values are `[indonesian]`, `[sundanese], and [javanese]`.
+            decoder_inputs (:obj:`str` or `List[str]`, `optional`):
+                decoder text sequence or list of text sequences to be tokenized.
+            decoder_lang_token (:obj:`str`, defaults to :obj:`[indonesian]`):
+                decoder language token to determine the format of the tokenized sequence. Valid values are `[indonesian]`, `[sundanese], and [javanese]`.
+            padding (:obj:`str`, defaults to :obj:`longest`):
+                padding strategy to pad the tokenized sequences. Valid values are `longest`, `max_length`, and `do_not_pad`.
+            return_tensors (:obj:`str`, defaults to :obj:`None`):
+                Returned tensor type of the tokenized sequence. When set to `None`, the return type will be List[int]. Valid values are `None`, `pt`, and `tf`
+
+        Returns:
+            :obj:`Dict`: Dictionary with `input_ids`, `attention_mask`, `decoder_input_ids` (optional), and `decoder_attention_mask` (optional)
+        """
+        if model_type == 'indogpt':
+            # Process indogpt input
+            if type(inputs) == str:
+                return self(f'<s> {inputs}', padding=padding, return_tensors=return_tensors)
+            elif type(inputs) == list:
+                if len(inputs) == 0 or type(inputs[0]) != str:
+                    raise ValueError(IndoNLGTokenizer.input_error_message)
+                else:
+                    return self([f'<s> {input_data}' for input_data in inputs], padding=padding, return_tensors=return_tensors)
+            else:
+                raise ValueError(IndoNLGTokenizer.input_error_message)
+        elif model_type == 'indobart':
+
+            # Process encoder input
+            if lang_token not in self.special_tokens_to_ids:
+                raise ValueError(f"Unknown lang_token `{lang_token}`, lang_token must be either `[javanese]`, `[sundanese]`, or `[indonesian]`")
+            elif type(inputs) == list:
+                if len(inputs) == 0 or type(inputs[0]) != str:
+                    raise ValueError(IndoNLGTokenizer.input_error_message)
+            elif type(inputs) != str:
+                raise ValueError(IndoNLGTokenizer.input_error_message)
+
+            lang_id = self.special_tokens_to_ids[lang_token]
+            input_batch = self(inputs, return_attention_mask=False)
+            if type(inputs) == str:
+                input_batch['input_ids'] = [self.bos_token_id] + input_batch['input_ids'] + [self.eos_token_id, lang_id]
+            else:
+                input_batch['input_ids'] = list(map(lambda input_ids: [self.bos_token_id] + input_ids + [self.eos_token_id, lang_id], input_batch['input_ids']))
+
+            if decoder_inputs is None:
+                # Return encoder input
+                return self.pad(input_batch, return_tensors=return_tensors)
+            else:
+                # Process decoder input
+                if decoder_lang_token not in self.special_tokens_to_ids:
+                    raise ValueError(f"Unknown decoder_lang_token `{decoder_lang_token}`, decoder_lang_token must be either `[javanese]`, `[sundanese]`, or `[indonesian]`")
+                elif type(decoder_inputs) == list:
+                    if len(decoder_inputs) == 0:
+                        raise ValueError(IndoNLGTokenizer.input_error_message)
+                    elif type(decoder_inputs[0]) != str:
+                        raise ValueError(IndoNLGTokenizer.input_error_message)
+                elif type(decoder_inputs) != str:
+                    raise ValueError(IndoNLGTokenizer.input_error_message)
+
+                decoder_lang_id = self.special_tokens_to_ids[decoder_lang_token]
+                decoder_input_batch = self(decoder_inputs, return_attention_mask=False)
+
+                if type(decoder_inputs) == str:
+                    labels = [self.bos_token_id] + decoder_input_batch['input_ids'] + [self.eos_token_id, decoder_lang_id]
+                    decoder_input_batch['input_ids'] = [decoder_lang_id, self.bos_token_id] + decoder_input_batch['input_ids'] + [self.eos_token_id]
+                else:
+                    labels = list(map(lambda input_ids: [self.bos_token_id] + input_ids + [self.eos_token_id, decoder_lang_id], decoder_input_batch['input_ids']))
+                    decoder_input_batch['input_ids'] = list(map(lambda input_ids: [decoder_lang_id, self.bos_token_id] + input_ids + [self.eos_token_id], decoder_input_batch['input_ids']))
+
+                # Padding
+                input_batch = self.pad(input_batch, return_tensors=return_tensors)
+                decoder_input_batch = self.pad(decoder_input_batch, return_tensors=return_tensors)
+                labels = self.pad({'input_ids': labels}, return_tensors=return_tensors)['input_ids']
+                if not isinstance(labels, (list, tuple)):
+                    labels[labels == self.pad_token_id] = -100
+                else:
+                    labels = list(map(lambda x: -100 if x == self.pad_token_id else x, labels))
+
+                # Store into a single dict
+                input_batch['decoder_input_ids'] = decoder_input_batch['input_ids']
+                input_batch['decoder_attention_mask'] = decoder_input_batch['attention_mask']
+                input_batch['labels'] = labels
+
+                return input_batch
+
+    def __len__(self):
+        return max(self.special_ids_to_tokens) + 1
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    @property
+    def vocab_size(self):
+        return 4 + len(self.sp_model)
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.encode(text.lower(), out_type=str)
+
+    def convert_ids_to_tokens(
+        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
+    ) -> Union[str, List[str]]:
+        """
+        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
+        added tokens.
+        Args:
+            ids (`int` or `List[int]`):
+                The token id (or token ids) to convert to tokens.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the decoding.
+        Returns:
+            `str` or `List[str]`: The decoded token(s).
+        """
+        if isinstance(ids, int):
+            if ids not in self.added_tokens_decoder or ids in self.special_tokens_to_ids:
+                return self._convert_id_to_token(ids, skip_special_tokens=skip_special_tokens)
+            else:
+                return self.added_tokens_decoder[ids].content
+        tokens = []
+        for index in ids:
+            index = int(index)
+            if skip_special_tokens and index in (self.all_special_ids + list(self.special_tokens_to_ids.values())):
+                continue
+            if index not in self.added_tokens_decoder or index in self.special_tokens_to_ids:
+                tokens.append(self._convert_id_to_token(index, skip_special_tokens=skip_special_tokens))
+            else:
+                tokens.append(self.added_tokens_decoder[index].content)
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        if token in self.special_tokens_to_ids:
+            return self.special_tokens_to_ids[token]
+        return self.sp_model.PieceToId(token)
+
+    def _convert_id_to_token(self, index, skip_special_tokens=False):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if skip_special_tokens and index in self.special_token_ids:
+            return ''
+
+        if index in self.special_ids_to_tokens:
+            return self.special_ids_to_tokens[index]
+
+        token = self.sp_model.IdToPiece(index)
+        if '<0x' in token:
+            char_rep = chr(int(token[1:-1], 0))
+            if char_rep.isprintable():
+                return char_rep
+        return token
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def decode(self, inputs, skip_special_tokens=False, **kwargs):
+        outputs = super().decode(inputs, skip_special_tokens=skip_special_tokens, **kwargs)
+        return outputs.replace(' ','').replace(SPIECE_UNDERLINE, ' ')
+
+    def _pad_decoder(
+        self,
+        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                >= 7.5 (Volta).
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "decoder_attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[2]]
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "decoder_attention_mask" not in encoded_inputs:
+            encoded_inputs["decoder_attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+
+            if self.padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["decoder_attention_mask"] = encoded_inputs["decoder_attention_mask"] + [0] * difference
+                if "decoder_token_type_ids" in encoded_inputs:
+                    encoded_inputs["decoder_token_type_ids"] = (
+                        encoded_inputs["decoder_token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "decoder_special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["decoder_special_tokens_mask"] = encoded_inputs["decoder_special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[2]] = required_input + [self.pad_token_id] * difference
+
+                label_input = encoded_inputs[self.model_input_names[4]]
+                encoded_inputs[self.model_input_names[4]] = label_input + [-100] * difference
+            elif self.padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["decoder_attention_mask"] = [0] * difference + encoded_inputs["decoder_attention_mask"]
+                if "decoder_token_type_ids" in encoded_inputs:
+                    encoded_inputs["decoder_token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "decoder_token_type_ids"
+                    ]
+                if "decoder_special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["decoder_special_tokens_mask"] = [1] * difference + encoded_inputs["decoder_special_tokens_mask"]
+                encoded_inputs[self.model_input_names[2]] = [self.pad_token_id] * difference + required_input
+
+                label_input = encoded_inputs[self.model_input_names[4]]
+                encoded_inputs[self.model_input_names[4]] = label_input + [-100] * difference
+            else:
+                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+
+        return encoded_inputs
+
+    def pad(self,
+        encoded_inputs: Union[
+            BatchEncoding,
+            List[BatchEncoding],
+            Dict[str, EncodedInput],
+            Dict[str, List[EncodedInput]],
+            List[Dict[str, EncodedInput]],
+        ],
+        padding: Union[bool, str, PaddingStrategy] = True,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
+        in the batch.
+
+        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
+        `self.pad_token_id` and `self.pad_token_type_id`)
+
+        <Tip>
+
+        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
+        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
+        PyTorch tensors, you will lose the specific device of your tensors however.
+
+        </Tip>
+
+        Args:
+            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
+                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
+                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
+                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+                collate function.
+
+                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
+                the note above for the return type.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                >= 7.5 (Volta).
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+        """
+        # If we have a list of dicts, let's convert it in a dict of lists
+        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
+        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
+            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
+
+        # The model's main input name, usually `input_ids`, has be passed for padding
+        if self.model_input_names[0] not in encoded_inputs:
+            raise ValueError(
+                "You should supply an encoding or a list of encodings to this method "
+                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
+            )
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if not required_input:
+            if return_attention_mask:
+                encoded_inputs["attention_mask"] = []
+            return encoded_inputs
+
+        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
+        # and rebuild them afterwards if no return_tensors is specified
+        # Note that we lose the specific device the tensor may be on for PyTorch
+
+        first_element = required_input[0]
+        if isinstance(first_element, (list, tuple)):
+            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
+            for item in required_input:
+                if len(item) != 0:
+                    first_element = item[0]
+                    break
+        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
+        if not isinstance(first_element, (int, list, tuple)):
+            if is_tf_available() and _is_tensorflow(first_element):
+                return_tensors = "tf" if return_tensors is None else return_tensors
+            elif is_torch_available() and _is_torch(first_element):
+                return_tensors = "pt" if return_tensors is None else return_tensors
+            elif isinstance(first_element, np.ndarray):
+                return_tensors = "np" if return_tensors is None else return_tensors
+            else:
+                raise ValueError(
+                    f"type of {first_element} unknown: {type(first_element)}. "
+                    f"Should be one of a python, numpy, pytorch or tensorflow object."
+                )
+
+            for key, value in encoded_inputs.items():
+                encoded_inputs[key] = to_py_obj(value)
+
+        # Convert padding_strategy in PaddingStrategy
+        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
+            padding=padding, max_length=max_length, verbose=verbose
+        )
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+        if required_input and not isinstance(required_input[0], (list, tuple)):
+            encoded_inputs = self._pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+            )
+            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
+
+        batch_size = len(required_input)
+        assert all(
+            len(v) == batch_size for v in encoded_inputs.values()
+        ), "Some items in the output dictionary have a different batch size than others."
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = max(len(inputs) for inputs in required_input)
+            padding_strategy = PaddingStrategy.MAX_LENGTH
+
+        batch_outputs = {}
+        for i in range(batch_size):
+            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+            )
+
+            # Handle decoder_input_ids
+            if self.model_input_names[2] in outputs:
+                max_decoder_length = max(len(inputs) for inputs in encoded_inputs[self.model_input_names[2]])
+                outputs = self._pad_decoder(
+                    outputs,
+                    max_length=max_decoder_length,
+                    padding_strategy=padding_strategy,
+                    pad_to_multiple_of=pad_to_multiple_of,
+                    return_attention_mask=return_attention_mask,
+                )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
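As a rough usage sketch (assuming the `indobenchmark/indogpt` and `abdiharyadi/kancilgpt` checkpoints are reachable, and with this `tokenizers.py` on the import path, where it shadows the PyPI `tokenizers` package exactly as app.py relies on), the vendored tokenizer can be exercised like this:

```python
# Rough sketch mirroring app.py: encode an indogpt-style prompt with the
# vendored IndoNLGTokenizer, sample from the fine-tuned model, and decode.
from transformers import GPT2LMHeadModel
from tokenizers import IndoNLGTokenizer  # the local file above, not the PyPI package

tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indogpt")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("abdiharyadi/kancilgpt")

# For model_type='indogpt', prepare_input_for_generation simply prepends "<s> "
# before encoding; app.py inlines the same thing via tokenizer('<s> ...').
inputs = tokenizer.prepare_input_for_generation(
    "awal cerita | judul:", model_type="indogpt", return_tensors="pt"
)
output_ids = model.generate(
    **inputs, do_sample=True, max_length=512, pad_token_id=tokenizer.eos_token_id
)
print(tokenizer.decode(output_ids[0]))
```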