Update tokenization_chatglm.py
Compatibility with new transformers
tokenization_chatglm.py  +1 -64  CHANGED
@@ -262,67 +262,4 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         token_ids_0 = prefix_tokens + token_ids_0
         if token_ids_1 is not None:
             token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
-        return token_ids_0
-
-    def _pad(
-            self,
-            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-            max_length: Optional[int] = None,
-            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-            pad_to_multiple_of: Optional[int] = None,
-            return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        assert self.padding_side == "left"
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        seq_length = len(required_input)
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * seq_length
-
-        if "position_ids" not in encoded_inputs:
-            encoded_inputs["position_ids"] = list(range(seq_length))
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-            if "position_ids" in encoded_inputs:
-                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
-            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-
-        return encoded_inputs
+        return token_ids_0
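For context, a minimal usage sketch (not part of the commit) of how padding behaves once the override is gone: a call like tokenizer(..., padding=...) falls through to the stock _pad of whatever transformers release is installed, instead of the custom left-padding override removed above. The repository id below is a placeholder assumption; substitute the checkpoint this tokenizer actually ships with.

# Minimal sketch, assuming a ChatGLM-style checkpoint; "THUDM/chatglm3-6b" is
# a placeholder repo id, not confirmed by this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm3-6b",     # placeholder checkpoint id (assumption)
    trust_remote_code=True,  # loads this custom tokenizer code from the repo
)

# With _pad no longer overridden, this call is served by the base
# PreTrainedTokenizer._pad of the installed transformers version.
batch = tokenizer(
    ["Hello", "A somewhat longer input sentence"],
    padding="longest",
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)

One behavioral difference visible in the diff: the removed override asserted left padding and injected position_ids during padding, which the base implementation does not do, so callers that relied on the tokenizer returning padded position_ids may need to construct them model-side instead.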