abdiharyadi committed on
Commit e18c38e
1 Parent(s): 632ca18

fix: update IndoNLGTokenizer

Files changed (1)
  1. indobenchmark.py +3 -261
indobenchmark.py CHANGED
@@ -14,21 +14,11 @@
 # limitations under the License
 """ Tokenization classes for IndoNLG model."""
 
-from typing import Dict, List, Optional, Tuple, Union
-from transformers import PreTrainedTokenizer, BatchEncoding
+from typing import List, Optional, Tuple, Union
+from transformers import PreTrainedTokenizer
 
-from collections.abc import Mapping
-from transformers.utils import (
-    PaddingStrategy,
-    TensorType,
-    is_tf_available,
-    is_torch_available,
-    logging,
-    to_py_obj,
-)
-import numpy as np
+from transformers.utils import logging
 import sentencepiece as spm
-from transformers.utils.generic import _is_tensorflow, _is_torch
 
 logger = logging.get_logger(__name__)
 
@@ -350,251 +340,3 @@ class IndoNLGTokenizer(PreTrainedTokenizer):
     def decode(self, inputs, skip_special_tokens=False, **kwargs):
         outputs = super().decode(inputs, skip_special_tokens=skip_special_tokens, **kwargs)
         return outputs.replace(' ','').replace(SPIECE_UNDERLINE, ' ')
-
-    def _pad_decoder(
-        self,
-        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-                - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "decoder_attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[2]]
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if return_attention_mask and "decoder_attention_mask" not in encoded_inputs:
-            encoded_inputs["decoder_attention_mask"] = [1] * len(required_input)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-
-            if self.padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["decoder_attention_mask"] = encoded_inputs["decoder_attention_mask"] + [0] * difference
-                if "decoder_token_type_ids" in encoded_inputs:
-                    encoded_inputs["decoder_token_type_ids"] = (
-                        encoded_inputs["decoder_token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "decoder_special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["decoder_special_tokens_mask"] = encoded_inputs["decoder_special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[2]] = required_input + [self.pad_token_id] * difference
-
-                label_input = encoded_inputs[self.model_input_names[4]]
-                encoded_inputs[self.model_input_names[4]] = label_input + [-100] * difference
-            elif self.padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["decoder_attention_mask"] = [0] * difference + encoded_inputs["decoder_attention_mask"]
-                if "decoder_token_type_ids" in encoded_inputs:
-                    encoded_inputs["decoder_token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "decoder_token_type_ids"
-                    ]
-                if "decoder_special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["decoder_special_tokens_mask"] = [1] * difference + encoded_inputs["decoder_special_tokens_mask"]
-                encoded_inputs[self.model_input_names[2]] = [self.pad_token_id] * difference + required_input
-
-                label_input = encoded_inputs[self.model_input_names[4]]
-                encoded_inputs[self.model_input_names[4]] = label_input + [-100] * difference
-            else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
-
-        return encoded_inputs
-
-    def pad(self,
-        encoded_inputs: Union[
-            BatchEncoding,
-            List[BatchEncoding],
-            Dict[str, EncodedInput],
-            Dict[str, List[EncodedInput]],
-            List[Dict[str, EncodedInput]],
-        ],
-        padding: Union[bool, str, PaddingStrategy] = True,
-        max_length: Optional[int] = None,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
-        in the batch.
-
-        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
-        `self.pad_token_id` and `self.pad_token_type_id`)
-
-        <Tip>
-
-        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
-        result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
-        PyTorch tensors, you will lose the specific device of your tensors however.
-
-        </Tip>
-
-        Args:
-            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`):
-                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
-                tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
-                List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
-                collate function.
-
-                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
-                the note above for the return type.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value.
-
-                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
-                >= 7.5 (Volta).
-            return_attention_mask (`bool`, *optional*):
-                Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the `return_outputs` attribute.
-
-                [What are attention masks?](../glossary#attention-mask)
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
-            verbose (`bool`, *optional*, defaults to `True`):
-                Whether or not to print more information and warnings.
-        """
-        # If we have a list of dicts, let's convert it in a dict of lists
-        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
-        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
-            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
-
-        # The model's main input name, usually `input_ids`, has be passed for padding
-        if self.model_input_names[0] not in encoded_inputs:
-            raise ValueError(
-                "You should supply an encoding or a list of encodings to this method "
-                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
-            )
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if not required_input:
-            if return_attention_mask:
-                encoded_inputs["attention_mask"] = []
-            return encoded_inputs
-
-        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
-        # and rebuild them afterwards if no return_tensors is specified
-        # Note that we lose the specific device the tensor may be on for PyTorch
-
-        first_element = required_input[0]
-        if isinstance(first_element, (list, tuple)):
-            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
-            for item in required_input:
-                if len(item) != 0:
-                    first_element = item[0]
-                    break
-        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
-        if not isinstance(first_element, (int, list, tuple)):
-            if is_tf_available() and _is_tensorflow(first_element):
-                return_tensors = "tf" if return_tensors is None else return_tensors
-            elif is_torch_available() and _is_torch(first_element):
-                return_tensors = "pt" if return_tensors is None else return_tensors
-            elif isinstance(first_element, np.ndarray):
-                return_tensors = "np" if return_tensors is None else return_tensors
-            else:
-                raise ValueError(
-                    f"type of {first_element} unknown: {type(first_element)}. "
-                    f"Should be one of a python, numpy, pytorch or tensorflow object."
-                )
-
-            for key, value in encoded_inputs.items():
-                encoded_inputs[key] = to_py_obj(value)
-
-        # Convert padding_strategy in PaddingStrategy
-        padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
-            padding=padding, max_length=max_length, verbose=verbose
-        )
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-        if required_input and not isinstance(required_input[0], (list, tuple)):
-            encoded_inputs = self._pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                return_attention_mask=return_attention_mask,
-            )
-            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
-
-        batch_size = len(required_input)
-        assert all(
-            len(v) == batch_size for v in encoded_inputs.values()
-        ), "Some items in the output dictionary have a different batch size than others."
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = max(len(inputs) for inputs in required_input)
-            padding_strategy = PaddingStrategy.MAX_LENGTH
-
-        batch_outputs = {}
-        for i in range(batch_size):
-            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
-            outputs = self._pad(
-                inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                return_attention_mask=return_attention_mask,
-            )
-
-            # Handle decoder_input_ids
-            if self.model_input_names[2] in outputs:
-                max_decoder_length = max(len(inputs) for inputs in encoded_inputs[self.model_input_names[2]])
-                outputs = self._pad_decoder(
-                    outputs,
-                    max_length=max_decoder_length,
-                    padding_strategy=padding_strategy,
-                    pad_to_multiple_of=pad_to_multiple_of,
-                    return_attention_mask=return_attention_mask,
-                )
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
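With this commit the custom `pad` override and its `_pad_decoder` helper are removed, so batch padding on `IndoNLGTokenizer` falls back to the stock `PreTrainedTokenizer.pad`. Below is a minimal sketch of how padding could look after the update; the import path, the checkpoint name, and the sample sentences are assumptions for illustration and are not part of this commit.

# Sketch only: assumes indobenchmark.py is importable as `indobenchmark` and that
# the indobenchmark/indobart-v2 checkpoint provides the vocabulary files.
from indobenchmark import IndoNLGTokenizer

tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")

# Tokenize without padding (e.g. per example in a preprocessing step) ...
features = [tokenizer(text) for text in ["aku suka makan", "selamat pagi"]]

# ... then pad as a collate step. With the override gone, this runs the
# inherited PreTrainedTokenizer.pad, which pads input_ids and attention_mask
# to the longest sequence in the batch when padding=True.
batch = tokenizer.pad(features, padding=True, return_tensors="pt")
print(batch["input_ids"].shape)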
 
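The `decode` override that remains (shown as context in the hunk above) only post-processes the string returned by `PreTrainedTokenizer.decode`: it drops the literal spaces between SentencePiece pieces and maps the `SPIECE_UNDERLINE` word-boundary marker ("▁") back to spaces. A standalone sketch of that transformation, using a made-up piece string for illustration:

# Illustration only: replicates the two .replace() calls from IndoNLGTokenizer.decode
# on a hand-written SentencePiece-style string; it does not call the tokenizer itself.
SPIECE_UNDERLINE = "\u2581"  # "▁", the SentencePiece word-boundary marker

def postprocess(decoded: str) -> str:
    # Drop the spaces that separate the pieces, then restore real spaces
    # from the word-boundary markers.
    return decoded.replace(" ", "").replace(SPIECE_UNDERLINE, " ")

raw = "\u2581aku \u2581suka \u2581makan"
print(postprocess(raw))  # " aku suka makan" (the leading marker becomes a leading space)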