p208p2002 committed
Commit e753ff0 · verified · 1 Parent(s): d99196f

Update tokenization_chatglm.py

compatibility with new transformers
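
The one-line message doesn't spell out the breakage, but the usual symptom with recent transformers releases is that the library forwards extra keyword arguments such as padding_side when it calls a tokenizer's _pad override, and the old override's fixed signature then raises a TypeError. A minimal, self-contained sketch of that failure mode (assumption; the class below is illustrative only, not code from this repo):

# Sketch of the likely incompatibility (assumption: newer transformers
# internals forward extra keywords such as padding_side to _pad).
class LegacyTokenizer:
    def _pad(self, encoded_inputs, max_length=None, return_attention_mask=None):
        # Fixed keyword list, mirroring the style of the removed override.
        return encoded_inputs

tok = LegacyTokenizer()
try:
    # A caller that passes a keyword the old override never declared:
    tok._pad({"input_ids": [1, 2, 3]}, max_length=8, padding_side="left")
except TypeError as err:
    print(err)  # e.g. "_pad() got an unexpected keyword argument 'padding_side'"

Deleting the override, as this commit does, lets the inherited PreTrainedTokenizer._pad (whose signature tracks the installed transformers version) handle padding instead.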

Files changed (1)
  1. tokenization_chatglm.py +1 -64
tokenization_chatglm.py CHANGED
@@ -262,67 +262,4 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         token_ids_0 = prefix_tokens + token_ids_0
         if token_ids_1 is not None:
             token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
-        return token_ids_0
-
-    def _pad(
-            self,
-            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-            max_length: Optional[int] = None,
-            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-            pad_to_multiple_of: Optional[int] = None,
-            return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        assert self.padding_side == "left"
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        seq_length = len(required_input)
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * seq_length
-
-        if "position_ids" not in encoded_inputs:
-            encoded_inputs["position_ids"] = list(range(seq_length))
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-            if "position_ids" in encoded_inputs:
-                encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
-            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-
-        return encoded_inputs
+        return token_ids_0
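
For reference, a rough usage sketch after this change; padding now falls through to the inherited PreTrainedTokenizer._pad rather than the removed override ("<this-repo>" is a placeholder for the actual model id, and exact outputs depend on the tokenizer config):

# Usage sketch after the change; "<this-repo>" is a placeholder, not the real id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<this-repo>", trust_remote_code=True)
batch = tokenizer(["hi", "a somewhat longer prompt"],
                  padding=True, return_tensors="pt")
print(batch["input_ids"].shape, batch["attention_mask"].shape)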