willwade committed
Commit e2c1e0f · 1 parent: b817428

First push

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. configs/LRS3_V_WER19.1.ini +18 -0
  2. espnet/.DS_Store +0 -0
  3. espnet/asr/asr_utils.py +990 -0
  4. espnet/nets/.DS_Store +0 -0
  5. espnet/nets/batch_beam_search.py +349 -0
  6. espnet/nets/beam_search.py +516 -0
  7. espnet/nets/ctc_prefix_score.py +359 -0
  8. espnet/nets/e2e_asr_common.py +249 -0
  9. espnet/nets/lm_interface.py +86 -0
  10. espnet/nets/pytorch_backend/backbones/conv1d_extractor.py +25 -0
  11. espnet/nets/pytorch_backend/backbones/conv3d_extractor.py +47 -0
  12. espnet/nets/pytorch_backend/backbones/modules/resnet.py +178 -0
  13. espnet/nets/pytorch_backend/backbones/modules/resnet1d.py +213 -0
  14. espnet/nets/pytorch_backend/backbones/modules/shufflenetv2.py +165 -0
  15. espnet/nets/pytorch_backend/ctc.py +283 -0
  16. espnet/nets/pytorch_backend/e2e_asr_transformer.py +320 -0
  17. espnet/nets/pytorch_backend/e2e_asr_transformer_av.py +352 -0
  18. espnet/nets/pytorch_backend/lm/__init__.py +1 -0
  19. espnet/nets/pytorch_backend/lm/default.py +431 -0
  20. espnet/nets/pytorch_backend/lm/seq_rnn.py +178 -0
  21. espnet/nets/pytorch_backend/lm/transformer.py +252 -0
  22. espnet/nets/pytorch_backend/nets_utils.py +526 -0
  23. espnet/nets/pytorch_backend/transformer/__init__.py +1 -0
  24. espnet/nets/pytorch_backend/transformer/add_sos_eos.py +31 -0
  25. espnet/nets/pytorch_backend/transformer/attention.py +280 -0
  26. espnet/nets/pytorch_backend/transformer/convolution.py +73 -0
  27. espnet/nets/pytorch_backend/transformer/decoder.py +229 -0
  28. espnet/nets/pytorch_backend/transformer/decoder_layer.py +121 -0
  29. espnet/nets/pytorch_backend/transformer/embedding.py +217 -0
  30. espnet/nets/pytorch_backend/transformer/encoder.py +283 -0
  31. espnet/nets/pytorch_backend/transformer/encoder_layer.py +149 -0
  32. espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py +63 -0
  33. espnet/nets/pytorch_backend/transformer/layer_norm.py +33 -0
  34. espnet/nets/pytorch_backend/transformer/mask.py +51 -0
  35. espnet/nets/pytorch_backend/transformer/multi_layer_conv.py +105 -0
  36. espnet/nets/pytorch_backend/transformer/optimizer.py +75 -0
  37. espnet/nets/pytorch_backend/transformer/plot.py +134 -0
  38. espnet/nets/pytorch_backend/transformer/positionwise_feed_forward.py +30 -0
  39. espnet/nets/pytorch_backend/transformer/raw_embeddings.py +77 -0
  40. espnet/nets/pytorch_backend/transformer/repeat.py +30 -0
  41. espnet/nets/pytorch_backend/transformer/subsampling.py +52 -0
  42. espnet/nets/scorer_interface.py +188 -0
  43. espnet/nets/scorers/__init__.py +1 -0
  44. espnet/nets/scorers/ctc.py +158 -0
  45. espnet/nets/scorers/length_bonus.py +61 -0
  46. espnet/utils/cli_utils.py +65 -0
  47. espnet/utils/dynamic_import.py +23 -0
  48. espnet/utils/fill_missing_args.py +46 -0
  49. pipelines/.DS_Store +0 -0
  50. pipelines/data/.DS_Store +0 -0
configs/LRS3_V_WER19.1.ini ADDED
@@ -0,0 +1,18 @@
+ [input]
+ modality=video
+ v_fps=25
+
+ [model]
+ v_fps=25
+ model_path=benchmarks/LRS3/models/LRS3_V_WER19.1/model.pth
+ model_conf=benchmarks/LRS3/models/LRS3_V_WER19.1/model.json
+ rnnlm=benchmarks/LRS3/language_models/lm_en_subword/model.pth
+ rnnlm_conf=benchmarks/LRS3/language_models/lm_en_subword/model.json
+
+ [decode]
+ beam_size=40
+ penalty=0.0
+ maxlenratio=0.0
+ minlenratio=0.0
+ ctc_weight=0.1
+ lm_weight=0.3
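
For reference, a config in this .ini format can be read with Python's standard configparser. The snippet below is an illustrative sketch only: the loading code actually used by the pipeline is not part of this commit, and the keys simply mirror the file added above.

    # Illustrative only: reading back the keys from configs/LRS3_V_WER19.1.ini.
    import configparser

    config = configparser.ConfigParser()
    config.read("configs/LRS3_V_WER19.1.ini")

    modality = config["input"]["modality"]                # "video"
    v_fps = config["model"].getint("v_fps")               # 25
    beam_size = config["decode"].getint("beam_size")      # 40
    ctc_weight = config["decode"].getfloat("ctc_weight")  # 0.1
    print(modality, v_fps, beam_size, ctc_weight)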
espnet/.DS_Store ADDED
Binary file (6.15 kB).
 
espnet/asr/asr_utils.py ADDED
@@ -0,0 +1,990 @@
1
+ # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
2
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ import argparse
5
+ import copy
6
+ import json
7
+ import logging
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
+ # * -------------------- training iterator related -------------------- *
17
+
18
+
19
+ class CompareValueTrigger(object):
20
+ """Trigger invoked when key value getting bigger or lower than before.
21
+
22
+ Args:
23
+ key (str) : Key of value.
24
+ compare_fn ((float, float) -> bool) : Function to compare the values.
25
+ trigger (tuple(int, str)) : Trigger that decide the comparison interval.
26
+
27
+ """
28
+
29
+ def __init__(self, key, compare_fn, trigger=(1, "epoch")):
30
+ from chainer import training
31
+
32
+ self._key = key
33
+ self._best_value = None
34
+ self._interval_trigger = training.util.get_trigger(trigger)
35
+ self._init_summary()
36
+ self._compare_fn = compare_fn
37
+
38
+ def __call__(self, trainer):
39
+ """Get value related to the key and compare with current value."""
40
+ observation = trainer.observation
41
+ summary = self._summary
42
+ key = self._key
43
+ if key in observation:
44
+ summary.add({key: observation[key]})
45
+
46
+ if not self._interval_trigger(trainer):
47
+ return False
48
+
49
+ stats = summary.compute_mean()
50
+ value = float(stats[key]) # copy to CPU
51
+ self._init_summary()
52
+
53
+ if self._best_value is None:
54
+ # initialize best value
55
+ self._best_value = value
56
+ return False
57
+ elif self._compare_fn(self._best_value, value):
58
+ return True
59
+ else:
60
+ self._best_value = value
61
+ return False
62
+
63
+ def _init_summary(self):
64
+ import chainer
65
+
66
+ self._summary = chainer.reporter.DictSummary()
67
+
68
+
69
+ try:
70
+ from chainer.training import extension
71
+ except ImportError:
72
+ PlotAttentionReport = None
73
+ else:
74
+
75
+ class PlotAttentionReport(extension.Extension):
76
+ """Plot attention reporter.
77
+
78
+ Args:
79
+ att_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_attentions):
80
+ Function of attention visualization.
81
+ data (list[tuple(str, dict[str, list[Any]])]): List of json utterance key items.
82
+ outdir (str): Directory to save figures.
83
+ converter (espnet.asr.*_backend.asr.CustomConverter):
84
+ Function to convert data.
85
+ device (int | torch.device): Device.
86
+ reverse (bool): If True, input and output length are reversed.
87
+ ikey (str): Key to access input
88
+ (for ASR/ST ikey="input", for MT ikey="output".)
89
+ iaxis (int): Dimension to access input
90
+ (for ASR/ST iaxis=0, for MT iaxis=1.)
91
+ okey (str): Key to access output
92
+ (for ASR/ST okey="input", for MT okey="output".)
93
+ oaxis (int): Dimension to access output
94
+ (for ASR/ST oaxis=0, for MT oaxis=0.)
95
+ subsampling_factor (int): subsampling factor in encoder
96
+
97
+ """
98
+
99
+ def __init__(
100
+ self,
101
+ att_vis_fn,
102
+ data,
103
+ outdir,
104
+ converter,
105
+ transform,
106
+ device,
107
+ reverse=False,
108
+ ikey="input",
109
+ iaxis=0,
110
+ okey="output",
111
+ oaxis=0,
112
+ subsampling_factor=1,
113
+ ):
114
+ self.att_vis_fn = att_vis_fn
115
+ self.data = copy.deepcopy(data)
116
+ self.data_dict = {k: v for k, v in copy.deepcopy(data)}
117
+ # key is utterance ID
118
+ self.outdir = outdir
119
+ self.converter = converter
120
+ self.transform = transform
121
+ self.device = device
122
+ self.reverse = reverse
123
+ self.ikey = ikey
124
+ self.iaxis = iaxis
125
+ self.okey = okey
126
+ self.oaxis = oaxis
127
+ self.factor = subsampling_factor
128
+ if not os.path.exists(self.outdir):
129
+ os.makedirs(self.outdir)
130
+
131
+ def __call__(self, trainer):
132
+ """Plot and save image file of att_ws matrix."""
133
+ att_ws, uttid_list = self.get_attention_weights()
134
+ if isinstance(att_ws, list): # multi-encoder case
135
+ num_encs = len(att_ws) - 1
136
+ # atts
137
+ for i in range(num_encs):
138
+ for idx, att_w in enumerate(att_ws[i]):
139
+ filename = "%s/%s.ep.{.updater.epoch}.att%d.png" % (
140
+ self.outdir,
141
+ uttid_list[idx],
142
+ i + 1,
143
+ )
144
+ att_w = self.trim_attention_weight(uttid_list[idx], att_w)
145
+ np_filename = "%s/%s.ep.{.updater.epoch}.att%d.npy" % (
146
+ self.outdir,
147
+ uttid_list[idx],
148
+ i + 1,
149
+ )
150
+ np.save(np_filename.format(trainer), att_w)
151
+ self._plot_and_save_attention(att_w, filename.format(trainer))
152
+ # han
153
+ for idx, att_w in enumerate(att_ws[num_encs]):
154
+ filename = "%s/%s.ep.{.updater.epoch}.han.png" % (
155
+ self.outdir,
156
+ uttid_list[idx],
157
+ )
158
+ att_w = self.trim_attention_weight(uttid_list[idx], att_w)
159
+ np_filename = "%s/%s.ep.{.updater.epoch}.han.npy" % (
160
+ self.outdir,
161
+ uttid_list[idx],
162
+ )
163
+ np.save(np_filename.format(trainer), att_w)
164
+ self._plot_and_save_attention(
165
+ att_w, filename.format(trainer), han_mode=True
166
+ )
167
+ else:
168
+ for idx, att_w in enumerate(att_ws):
169
+ filename = "%s/%s.ep.{.updater.epoch}.png" % (
170
+ self.outdir,
171
+ uttid_list[idx],
172
+ )
173
+ att_w = self.trim_attention_weight(uttid_list[idx], att_w)
174
+ np_filename = "%s/%s.ep.{.updater.epoch}.npy" % (
175
+ self.outdir,
176
+ uttid_list[idx],
177
+ )
178
+ np.save(np_filename.format(trainer), att_w)
179
+ self._plot_and_save_attention(att_w, filename.format(trainer))
180
+
181
+ def log_attentions(self, logger, step):
182
+ """Add image files of att_ws matrix to the tensorboard."""
183
+ att_ws, uttid_list = self.get_attention_weights()
184
+ if isinstance(att_ws, list): # multi-encoder case
185
+ num_encs = len(att_ws) - 1
186
+ # atts
187
+ for i in range(num_encs):
188
+ for idx, att_w in enumerate(att_ws[i]):
189
+ att_w = self.trim_attention_weight(uttid_list[idx], att_w)
190
+ plot = self.draw_attention_plot(att_w)
191
+ logger.add_figure(
192
+ "%s_att%d" % (uttid_list[idx], i + 1),
193
+ plot.gcf(),
194
+ step,
195
+ )
196
+ # han
197
+ for idx, att_w in enumerate(att_ws[num_encs]):
198
+ att_w = self.trim_attention_weight(uttid_list[idx], att_w)
199
+ plot = self.draw_han_plot(att_w)
200
+ logger.add_figure(
201
+ "%s_han" % (uttid_list[idx]),
202
+ plot.gcf(),
203
+ step,
204
+ )
205
+ else:
206
+ for idx, att_w in enumerate(att_ws):
207
+ att_w = self.trim_attention_weight(uttid_list[idx], att_w)
208
+ plot = self.draw_attention_plot(att_w)
209
+ logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step)
210
+
211
+ def get_attention_weights(self):
212
+ """Return attention weights.
213
+
214
+ Returns:
215
+ numpy.ndarray: attention weights. float. Its shape
216
+ differs depending on the backend.
217
+ * pytorch-> 1) multi-head case => (B, H, Lmax, Tmax), 2)
218
+ other case => (B, Lmax, Tmax).
219
+ * chainer-> (B, Lmax, Tmax)
220
+
221
+ """
222
+ return_batch, uttid_list = self.transform(self.data, return_uttid=True)
223
+ batch = self.converter([return_batch], self.device)
224
+ if isinstance(batch, tuple):
225
+ att_ws = self.att_vis_fn(*batch)
226
+ else:
227
+ att_ws = self.att_vis_fn(**batch)
228
+ return att_ws, uttid_list
229
+
230
+ def trim_attention_weight(self, uttid, att_w):
231
+ """Transform attention matrix with regard to self.reverse."""
232
+ if self.reverse:
233
+ enc_key, enc_axis = self.okey, self.oaxis
234
+ dec_key, dec_axis = self.ikey, self.iaxis
235
+ else:
236
+ enc_key, enc_axis = self.ikey, self.iaxis
237
+ dec_key, dec_axis = self.okey, self.oaxis
238
+ dec_len = int(self.data_dict[uttid][dec_key][dec_axis]["shape"][0])
239
+ enc_len = int(self.data_dict[uttid][enc_key][enc_axis]["shape"][0])
240
+ if self.factor > 1:
241
+ enc_len //= self.factor
242
+ if len(att_w.shape) == 3:
243
+ att_w = att_w[:, :dec_len, :enc_len]
244
+ else:
245
+ att_w = att_w[:dec_len, :enc_len]
246
+ return att_w
247
+
248
+ def draw_attention_plot(self, att_w):
249
+ """Plot the att_w matrix.
250
+
251
+ Returns:
252
+ matplotlib.pyplot: pyplot object with attention matrix image.
253
+
254
+ """
255
+ import matplotlib
256
+
257
+ matplotlib.use("Agg")
258
+ import matplotlib.pyplot as plt
259
+
260
+ plt.clf()
261
+ att_w = att_w.astype(np.float32)
262
+ if len(att_w.shape) == 3:
263
+ for h, aw in enumerate(att_w, 1):
264
+ plt.subplot(1, len(att_w), h)
265
+ plt.imshow(aw, aspect="auto")
266
+ plt.xlabel("Encoder Index")
267
+ plt.ylabel("Decoder Index")
268
+ else:
269
+ plt.imshow(att_w, aspect="auto")
270
+ plt.xlabel("Encoder Index")
271
+ plt.ylabel("Decoder Index")
272
+ plt.tight_layout()
273
+ return plt
274
+
275
+ def draw_han_plot(self, att_w):
276
+ """Plot the att_w matrix for hierarchical attention.
277
+
278
+ Returns:
279
+ matplotlib.pyplot: pyplot object with attention matrix image.
280
+
281
+ """
282
+ import matplotlib
283
+
284
+ matplotlib.use("Agg")
285
+ import matplotlib.pyplot as plt
286
+
287
+ plt.clf()
288
+ if len(att_w.shape) == 3:
289
+ for h, aw in enumerate(att_w, 1):
290
+ legends = []
291
+ plt.subplot(1, len(att_w), h)
292
+ for i in range(aw.shape[1]):
293
+ plt.plot(aw[:, i])
294
+ legends.append("Att{}".format(i))
295
+ plt.ylim([0, 1.0])
296
+ plt.xlim([0, aw.shape[0]])
297
+ plt.grid(True)
298
+ plt.ylabel("Attention Weight")
299
+ plt.xlabel("Decoder Index")
300
+ plt.legend(legends)
301
+ else:
302
+ legends = []
303
+ for i in range(att_w.shape[1]):
304
+ plt.plot(att_w[:, i])
305
+ legends.append("Att{}".format(i))
306
+ plt.ylim([0, 1.0])
307
+ plt.xlim([0, att_w.shape[0]])
308
+ plt.grid(True)
309
+ plt.ylabel("Attention Weight")
310
+ plt.xlabel("Decoder Index")
311
+ plt.legend(legends)
312
+ plt.tight_layout()
313
+ return plt
314
+
315
+ def _plot_and_save_attention(self, att_w, filename, han_mode=False):
316
+ if han_mode:
317
+ plt = self.draw_han_plot(att_w)
318
+ else:
319
+ plt = self.draw_attention_plot(att_w)
320
+ plt.savefig(filename)
321
+ plt.close()
322
+
323
+
324
+ try:
325
+ from chainer.training import extension
326
+ except ImportError:
327
+ PlotCTCReport = None
328
+ else:
329
+
330
+ class PlotCTCReport(extension.Extension):
331
+ """Plot CTC reporter.
332
+
333
+ Args:
334
+ ctc_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_ctc_probs):
335
+ Function of CTC visualization.
336
+ data (list[tuple(str, dict[str, list[Any]])]): List of json utterance key items.
337
+ outdir (str): Directory to save figures.
338
+ converter (espnet.asr.*_backend.asr.CustomConverter):
339
+ Function to convert data.
340
+ device (int | torch.device): Device.
341
+ reverse (bool): If True, input and output length are reversed.
342
+ ikey (str): Key to access input
343
+ (for ASR/ST ikey="input", for MT ikey="output".)
344
+ iaxis (int): Dimension to access input
345
+ (for ASR/ST iaxis=0, for MT iaxis=1.)
346
+ okey (str): Key to access output
347
+ (for ASR/ST okey="input", for MT okey="output".)
348
+ oaxis (int): Dimension to access output
349
+ (for ASR/ST oaxis=0, for MT oaxis=0.)
350
+ subsampling_factor (int): subsampling factor in encoder
351
+
352
+ """
353
+
354
+ def __init__(
355
+ self,
356
+ ctc_vis_fn,
357
+ data,
358
+ outdir,
359
+ converter,
360
+ transform,
361
+ device,
362
+ reverse=False,
363
+ ikey="input",
364
+ iaxis=0,
365
+ okey="output",
366
+ oaxis=0,
367
+ subsampling_factor=1,
368
+ ):
369
+ self.ctc_vis_fn = ctc_vis_fn
370
+ self.data = copy.deepcopy(data)
371
+ self.data_dict = {k: v for k, v in copy.deepcopy(data)}
372
+ # key is utterance ID
373
+ self.outdir = outdir
374
+ self.converter = converter
375
+ self.transform = transform
376
+ self.device = device
377
+ self.reverse = reverse
378
+ self.ikey = ikey
379
+ self.iaxis = iaxis
380
+ self.okey = okey
381
+ self.oaxis = oaxis
382
+ self.factor = subsampling_factor
383
+ if not os.path.exists(self.outdir):
384
+ os.makedirs(self.outdir)
385
+
386
+ def __call__(self, trainer):
387
+ """Plot and save image file of ctc prob."""
388
+ ctc_probs, uttid_list = self.get_ctc_probs()
389
+ if isinstance(ctc_probs, list): # multi-encoder case
390
+ num_encs = len(ctc_probs) - 1
391
+ for i in range(num_encs):
392
+ for idx, ctc_prob in enumerate(ctc_probs[i]):
393
+ filename = "%s/%s.ep.{.updater.epoch}.ctc%d.png" % (
394
+ self.outdir,
395
+ uttid_list[idx],
396
+ i + 1,
397
+ )
398
+ ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
399
+ np_filename = "%s/%s.ep.{.updater.epoch}.ctc%d.npy" % (
400
+ self.outdir,
401
+ uttid_list[idx],
402
+ i + 1,
403
+ )
404
+ np.save(np_filename.format(trainer), ctc_prob)
405
+ self._plot_and_save_ctc(ctc_prob, filename.format(trainer))
406
+ else:
407
+ for idx, ctc_prob in enumerate(ctc_probs):
408
+ filename = "%s/%s.ep.{.updater.epoch}.png" % (
409
+ self.outdir,
410
+ uttid_list[idx],
411
+ )
412
+ ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
413
+ np_filename = "%s/%s.ep.{.updater.epoch}.npy" % (
414
+ self.outdir,
415
+ uttid_list[idx],
416
+ )
417
+ np.save(np_filename.format(trainer), ctc_prob)
418
+ self._plot_and_save_ctc(ctc_prob, filename.format(trainer))
419
+
420
+ def log_ctc_probs(self, logger, step):
421
+ """Add image files of ctc probs to the tensorboard."""
422
+ ctc_probs, uttid_list = self.get_ctc_probs()
423
+ if isinstance(ctc_probs, list): # multi-encoder case
424
+ num_encs = len(ctc_probs) - 1
425
+ for i in range(num_encs):
426
+ for idx, ctc_prob in enumerate(ctc_probs[i]):
427
+ ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
428
+ plot = self.draw_ctc_plot(ctc_prob)
429
+ logger.add_figure(
430
+ "%s_ctc%d" % (uttid_list[idx], i + 1),
431
+ plot.gcf(),
432
+ step,
433
+ )
434
+ else:
435
+ for idx, ctc_prob in enumerate(ctc_probs):
436
+ ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob)
437
+ plot = self.draw_ctc_plot(ctc_prob)
438
+ logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step)
439
+
440
+ def get_ctc_probs(self):
441
+ """Return CTC probs.
442
+
443
+ Returns:
444
+ numpy.ndarray: CTC probs. float. Its shape
445
+ differs depending on the backend. (B, Tmax, vocab).
446
+
447
+ """
448
+ return_batch, uttid_list = self.transform(self.data, return_uttid=True)
449
+ batch = self.converter([return_batch], self.device)
450
+ if isinstance(batch, tuple):
451
+ probs = self.ctc_vis_fn(*batch)
452
+ else:
453
+ probs = self.ctc_vis_fn(**batch)
454
+ return probs, uttid_list
455
+
456
+ def trim_ctc_prob(self, uttid, prob):
457
+ """Trim CTC posteriors accoding to input lengths."""
458
+ enc_len = int(self.data_dict[uttid][self.ikey][self.iaxis]["shape"][0])
459
+ if self.factor > 1:
460
+ enc_len //= self.factor
461
+ prob = prob[:enc_len]
462
+ return prob
463
+
464
+ def draw_ctc_plot(self, ctc_prob):
465
+ """Plot the ctc_prob matrix.
466
+
467
+ Returns:
468
+ matplotlib.pyplot: pyplot object with CTC prob matrix image.
469
+
470
+ """
471
+ import matplotlib
472
+
473
+ matplotlib.use("Agg")
474
+ import matplotlib.pyplot as plt
475
+
476
+ ctc_prob = ctc_prob.astype(np.float32)
477
+
478
+ plt.clf()
479
+ topk_ids = np.argsort(ctc_prob, axis=1)
480
+ n_frames, vocab = ctc_prob.shape
481
+ times_probs = np.arange(n_frames)
482
+
483
+ plt.figure(figsize=(20, 8))
484
+
485
+ # NOTE: index 0 is reserved for blank
486
+ for idx in set(topk_ids.reshape(-1).tolist()):
487
+ if idx == 0:
488
+ plt.plot(
489
+ times_probs, ctc_prob[:, 0], ":", label="<blank>", color="grey"
490
+ )
491
+ else:
492
+ plt.plot(times_probs, ctc_prob[:, idx])
493
+ plt.xlabel("Input [frame]", fontsize=12)
494
+ plt.ylabel("Posteriors", fontsize=12)
495
+ plt.xticks(list(range(0, int(n_frames) + 1, 10)))
496
+ plt.yticks(list(range(0, 2, 1)))
497
+ plt.tight_layout()
498
+ return plt
499
+
500
+ def _plot_and_save_ctc(self, ctc_prob, filename):
501
+ plt = self.draw_ctc_plot(ctc_prob)
502
+ plt.savefig(filename)
503
+ plt.close()
504
+
505
+
506
+ def restore_snapshot(model, snapshot, load_fn=None):
507
+ """Extension to restore snapshot.
508
+
509
+ Returns:
510
+ An extension function.
511
+
512
+ """
513
+ import chainer
514
+ from chainer import training
515
+
516
+ if load_fn is None:
517
+ load_fn = chainer.serializers.load_npz
518
+
519
+ @training.make_extension(trigger=(1, "epoch"))
520
+ def restore_snapshot(trainer):
521
+ _restore_snapshot(model, snapshot, load_fn)
522
+
523
+ return restore_snapshot
524
+
525
+
526
+ def _restore_snapshot(model, snapshot, load_fn=None):
527
+ if load_fn is None:
528
+ import chainer
529
+
530
+ load_fn = chainer.serializers.load_npz
531
+
532
+ load_fn(snapshot, model)
533
+ logging.info("restored from " + str(snapshot))
534
+
535
+
536
+ def adadelta_eps_decay(eps_decay):
537
+ """Extension to perform adadelta eps decay.
538
+
539
+ Args:
540
+ eps_decay (float): Decay rate of eps.
541
+
542
+ Returns:
543
+ An extension function.
544
+
545
+ """
546
+ from chainer import training
547
+
548
+ @training.make_extension(trigger=(1, "epoch"))
549
+ def adadelta_eps_decay(trainer):
550
+ _adadelta_eps_decay(trainer, eps_decay)
551
+
552
+ return adadelta_eps_decay
553
+
554
+
555
+ def _adadelta_eps_decay(trainer, eps_decay):
556
+ optimizer = trainer.updater.get_optimizer("main")
557
+ # for chainer
558
+ if hasattr(optimizer, "eps"):
559
+ current_eps = optimizer.eps
560
+ setattr(optimizer, "eps", current_eps * eps_decay)
561
+ logging.info("adadelta eps decayed to " + str(optimizer.eps))
562
+ # pytorch
563
+ else:
564
+ for p in optimizer.param_groups:
565
+ p["eps"] *= eps_decay
566
+ logging.info("adadelta eps decayed to " + str(p["eps"]))
567
+
568
+
569
+ def adam_lr_decay(eps_decay):
570
+ """Extension to perform adam lr decay.
571
+
572
+ Args:
573
+ eps_decay (float): Decay rate of lr.
574
+
575
+ Returns:
576
+ An extension function.
577
+
578
+ """
579
+ from chainer import training
580
+
581
+ @training.make_extension(trigger=(1, "epoch"))
582
+ def adam_lr_decay(trainer):
583
+ _adam_lr_decay(trainer, eps_decay)
584
+
585
+ return adam_lr_decay
586
+
587
+
588
+ def _adam_lr_decay(trainer, eps_decay):
589
+ optimizer = trainer.updater.get_optimizer("main")
590
+ # for chainer
591
+ if hasattr(optimizer, "lr"):
592
+ current_lr = optimizer.lr
593
+ setattr(optimizer, "lr", current_lr * eps_decay)
594
+ logging.info("adam lr decayed to " + str(optimizer.lr))
595
+ # pytorch
596
+ else:
597
+ for p in optimizer.param_groups:
598
+ p["lr"] *= eps_decay
599
+ logging.info("adam lr decayed to " + str(p["lr"]))
600
+
601
+
602
+ def torch_snapshot(savefun=torch.save, filename="snapshot.ep.{.updater.epoch}"):
603
+ """Extension to take snapshot of the trainer for pytorch.
604
+
605
+ Returns:
606
+ An extension function.
607
+
608
+ """
609
+ from chainer.training import extension
610
+
611
+ @extension.make_extension(trigger=(1, "epoch"), priority=-100)
612
+ def torch_snapshot(trainer):
613
+ _torch_snapshot_object(trainer, trainer, filename.format(trainer), savefun)
614
+
615
+ return torch_snapshot
616
+
617
+
618
+ def _torch_snapshot_object(trainer, target, filename, savefun):
619
+ from chainer.serializers import DictionarySerializer
620
+
621
+ # make snapshot_dict dictionary
622
+ s = DictionarySerializer()
623
+ s.save(trainer)
624
+ if hasattr(trainer.updater.model, "model"):
625
+ # (for TTS)
626
+ if hasattr(trainer.updater.model.model, "module"):
627
+ model_state_dict = trainer.updater.model.model.module.state_dict()
628
+ else:
629
+ model_state_dict = trainer.updater.model.model.state_dict()
630
+ else:
631
+ # (for ASR)
632
+ if hasattr(trainer.updater.model, "module"):
633
+ model_state_dict = trainer.updater.model.module.state_dict()
634
+ else:
635
+ model_state_dict = trainer.updater.model.state_dict()
636
+ snapshot_dict = {
637
+ "trainer": s.target,
638
+ "model": model_state_dict,
639
+ "optimizer": trainer.updater.get_optimizer("main").state_dict(),
640
+ }
641
+
642
+ # save snapshot dictionary
643
+ fn = filename.format(trainer)
644
+ prefix = "tmp" + fn
645
+ tmpdir = tempfile.mkdtemp(prefix=prefix, dir=trainer.out)
646
+ tmppath = os.path.join(tmpdir, fn)
647
+ try:
648
+ savefun(snapshot_dict, tmppath)
649
+ shutil.move(tmppath, os.path.join(trainer.out, fn))
650
+ finally:
651
+ shutil.rmtree(tmpdir)
652
+
653
+
654
+ def add_gradient_noise(model, iteration, duration=100, eta=1.0, scale_factor=0.55):
655
+ """Adds noise from a standard normal distribution to the gradients.
656
+
657
+ The standard deviation (`sigma`) is controlled by the three hyper-parameters below.
658
+ `sigma` goes to zero (no noise) with more iterations.
659
+
660
+ Args:
661
+ model (torch.nn.model): Model.
662
+ iteration (int): Number of iterations.
663
+ duration (int) {100, 1000}:
664
+ Number of durations to control the interval of the `sigma` change.
665
+ eta (float) {0.01, 0.3, 1.0}: The magnitude of `sigma`.
666
+ scale_factor (float) {0.55}: The scale of `sigma`.
667
+ """
668
+ interval = (iteration // duration) + 1
669
+ sigma = eta / interval**scale_factor
670
+ for param in model.parameters():
671
+ if param.grad is not None:
672
+ _shape = param.grad.size()
673
+ noise = sigma * torch.randn(_shape).to(param.device)
674
+ param.grad += noise
675
+
676
+
677
+ # * -------------------- general -------------------- *
678
+ def get_model_conf(model_path, conf_path=None):
679
+ """Get model config information by reading a model config file (model.json).
680
+
681
+ Args:
682
+ model_path (str): Model path.
683
+ conf_path (str): Optional model config path.
684
+
685
+ Returns:
686
+ list[int, int, dict[str, Any]]: Config information loaded from json file.
687
+
688
+ """
689
+ if conf_path is None:
690
+ model_conf = os.path.dirname(model_path) + "/model.json"
691
+ else:
692
+ model_conf = conf_path
693
+ with open(model_conf, "rb") as f:
694
+ logging.info("reading a config file from " + model_conf)
695
+ confs = json.load(f)
696
+ if isinstance(confs, dict):
697
+ # for lm
698
+ args = confs
699
+ return argparse.Namespace(**args)
700
+ else:
701
+ # for asr, tts, mt
702
+ idim, odim, args = confs
703
+ return idim, odim, argparse.Namespace(**args)
704
+
705
+
706
+ def chainer_load(path, model):
707
+ """Load chainer model parameters.
708
+
709
+ Args:
710
+ path (str): Model path or snapshot file path to be loaded.
711
+ model (chainer.Chain): Chainer model.
712
+
713
+ """
714
+ import chainer
715
+
716
+ if "snapshot" in os.path.basename(path):
717
+ chainer.serializers.load_npz(path, model, path="updater/model:main/")
718
+ else:
719
+ chainer.serializers.load_npz(path, model)
720
+
721
+
722
+ def torch_save(path, model):
723
+ """Save torch model states.
724
+
725
+ Args:
726
+ path (str): Model path to be saved.
727
+ model (torch.nn.Module): Torch model.
728
+
729
+ """
730
+ if hasattr(model, "module"):
731
+ torch.save(model.module.state_dict(), path)
732
+ else:
733
+ torch.save(model.state_dict(), path)
734
+
735
+
736
+ def snapshot_object(target, filename):
737
+ """Returns a trainer extension to take snapshots of a given object.
738
+
739
+ Args:
740
+ target (model): Object to serialize.
741
+ filename (str): Name of the file into which the object is serialized. It can
742
+ be a format string, where the trainer object is passed to
743
+ the :meth: `str.format` method. For example,
744
+ ``'snapshot_{.updater.iteration}'`` is converted to
745
+ ``'snapshot_10000'`` at the 10,000th iteration.
746
+
747
+ Returns:
748
+ An extension function.
749
+
750
+ """
751
+ from chainer.training import extension
752
+
753
+ @extension.make_extension(trigger=(1, "epoch"), priority=-100)
754
+ def snapshot_object(trainer):
755
+ torch_save(os.path.join(trainer.out, filename.format(trainer)), target)
756
+
757
+ return snapshot_object
758
+
759
+
760
+ def torch_load(path, model):
761
+ """Load torch model states.
762
+
763
+ Args:
764
+ path (str): Model path or snapshot file path to be loaded.
765
+ model (torch.nn.Module): Torch model.
766
+
767
+ """
768
+ if "snapshot" in os.path.basename(path):
769
+ model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)[
770
+ "model"
771
+ ]
772
+ else:
773
+ model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)
774
+
775
+ if hasattr(model, "module"):
776
+ model.module.load_state_dict(model_state_dict)
777
+ else:
778
+ model.load_state_dict(model_state_dict)
779
+
780
+ del model_state_dict
781
+
782
+
783
+ def torch_resume(snapshot_path, trainer):
784
+ """Resume from snapshot for pytorch.
785
+
786
+ Args:
787
+ snapshot_path (str): Snapshot file path.
788
+ trainer (chainer.training.Trainer): Chainer's trainer instance.
789
+
790
+ """
791
+ from chainer.serializers import NpzDeserializer
792
+
793
+ # load snapshot
794
+ snapshot_dict = torch.load(snapshot_path, map_location=lambda storage, loc: storage)
795
+
796
+ # restore trainer states
797
+ d = NpzDeserializer(snapshot_dict["trainer"])
798
+ d.load(trainer)
799
+
800
+ # restore model states
801
+ if hasattr(trainer.updater.model, "model"):
802
+ # (for TTS model)
803
+ if hasattr(trainer.updater.model.model, "module"):
804
+ trainer.updater.model.model.module.load_state_dict(snapshot_dict["model"])
805
+ else:
806
+ trainer.updater.model.model.load_state_dict(snapshot_dict["model"])
807
+ else:
808
+ # (for ASR model)
809
+ if hasattr(trainer.updater.model, "module"):
810
+ trainer.updater.model.module.load_state_dict(snapshot_dict["model"])
811
+ else:
812
+ trainer.updater.model.load_state_dict(snapshot_dict["model"])
813
+
814
+ # restore optimizer states
815
+ trainer.updater.get_optimizer("main").load_state_dict(snapshot_dict["optimizer"])
816
+
817
+ # delete opened snapshot
818
+ del snapshot_dict
819
+
820
+
821
+ # * ------------------ recognition related ------------------ *
822
+ def parse_hypothesis(hyp, char_list):
823
+ """Parse hypothesis.
824
+
825
+ Args:
826
+ hyp (list[dict[str, Any]]): Recognition hypothesis.
827
+ char_list (list[str]): List of characters.
828
+
829
+ Returns:
830
+ tuple(str, str, str, float)
831
+
832
+ """
833
+ # remove sos and get results
834
+ tokenid_as_list = list(map(int, hyp["yseq"][1:]))
835
+ token_as_list = [char_list[idx] for idx in tokenid_as_list]
836
+ score = float(hyp["score"])
837
+
838
+ # convert to string
839
+ tokenid = " ".join([str(idx) for idx in tokenid_as_list])
840
+ token = " ".join(token_as_list)
841
+ text = "".join(token_as_list).replace("<space>", " ")
842
+
843
+ return text, token, tokenid, score
844
+
845
+
846
+ def add_results_to_json(nbest_hyps, char_list):
847
+ """Add N-best results to json.
848
+ Args:
849
+ nbest_hyps (list[dict[str, Any]]): List of N-best hypotheses.
852
+ char_list (list[str]): List of characters.
853
+ Returns:
854
+ str: 1-best result
855
+ """
856
+ assert len(nbest_hyps) == 1, "only 1-best result is supported."
857
+ # parse hypothesis
858
+ rec_text, rec_token, rec_tokenid, score = parse_hypothesis(nbest_hyps[0], char_list)
859
+ return rec_text
860
+
861
+
862
+ def plot_spectrogram(
863
+ plt,
864
+ spec,
865
+ mode="db",
866
+ fs=None,
867
+ frame_shift=None,
868
+ bottom=True,
869
+ left=True,
870
+ right=True,
871
+ top=False,
872
+ labelbottom=True,
873
+ labelleft=True,
874
+ labelright=True,
875
+ labeltop=False,
876
+ cmap="inferno",
877
+ ):
878
+ """Plot spectrogram using matplotlib.
879
+
880
+ Args:
881
+ plt (matplotlib.pyplot): pyplot object.
882
+ spec (numpy.ndarray): Input stft (Freq, Time)
883
+ mode (str): db or linear.
884
+ fs (int): Sample frequency. To convert y-axis to kHz unit.
885
+ frame_shift (int): The frame shift of stft. To convert x-axis to second unit.
886
+ bottom (bool): Whether to draw the respective ticks.
887
+ left (bool):
888
+ right (bool):
889
+ top (bool):
890
+ labelbottom (bool): Whether to draw the respective tick labels.
891
+ labelleft (bool):
892
+ labelright (bool):
893
+ labeltop (bool):
894
+ cmap (str): Colormap defined in matplotlib.
895
+
896
+ """
897
+ spec = np.abs(spec)
898
+ if mode == "db":
899
+ x = 20 * np.log10(spec + np.finfo(spec.dtype).eps)
900
+ elif mode == "linear":
901
+ x = spec
902
+ else:
903
+ raise ValueError(mode)
904
+
905
+ if fs is not None:
906
+ ytop = fs / 2000
907
+ ylabel = "kHz"
908
+ else:
909
+ ytop = x.shape[0]
910
+ ylabel = "bin"
911
+
912
+ if frame_shift is not None and fs is not None:
913
+ xtop = x.shape[1] * frame_shift / fs
914
+ xlabel = "s"
915
+ else:
916
+ xtop = x.shape[1]
917
+ xlabel = "frame"
918
+
919
+ extent = (0, xtop, 0, ytop)
920
+ plt.imshow(x[::-1], cmap=cmap, extent=extent)
921
+
922
+ if labelbottom:
923
+ plt.xlabel("time [{}]".format(xlabel))
924
+ if labelleft:
925
+ plt.ylabel("freq [{}]".format(ylabel))
926
+ plt.colorbar().set_label("{}".format(mode))
927
+
928
+ plt.tick_params(
929
+ bottom=bottom,
930
+ left=left,
931
+ right=right,
932
+ top=top,
933
+ labelbottom=labelbottom,
934
+ labelleft=labelleft,
935
+ labelright=labelright,
936
+ labeltop=labeltop,
937
+ )
938
+ plt.axis("auto")
939
+
940
+
941
+ # * ------------------ recognition related ------------------ *
942
+ def format_mulenc_args(args):
943
+ """Format args for multi-encoder setup.
944
+
945
+ It deals with following situations: (when args.num_encs=2):
946
+ 1. args.elayers = None -> args.elayers = [4, 4];
947
+ 2. args.elayers = 4 -> args.elayers = [4, 4];
948
+ 3. args.elayers = [4, 4, 4] -> args.elayers = [4, 4].
949
+
950
+ """
951
+ # default values when None is assigned.
952
+ default_dict = {
953
+ "etype": "blstmp",
954
+ "elayers": 4,
955
+ "eunits": 300,
956
+ "subsample": "1",
957
+ "dropout_rate": 0.0,
958
+ "atype": "dot",
959
+ "adim": 320,
960
+ "awin": 5,
961
+ "aheads": 4,
962
+ "aconv_chans": -1,
963
+ "aconv_filts": 100,
964
+ }
965
+ for k in default_dict.keys():
966
+ if isinstance(vars(args)[k], list):
967
+ if len(vars(args)[k]) != args.num_encs:
968
+ logging.warning(
969
+ "Length mismatch {}: Convert {} to {}.".format(
970
+ k, vars(args)[k], vars(args)[k][: args.num_encs]
971
+ )
972
+ )
973
+ vars(args)[k] = vars(args)[k][: args.num_encs]
974
+ else:
975
+ if not vars(args)[k]:
976
+ # assign default value if it is None
977
+ vars(args)[k] = default_dict[k]
978
+ logging.warning(
979
+ "{} is not specified, use default value {}.".format(
980
+ k, default_dict[k]
981
+ )
982
+ )
983
+ # duplicate
984
+ logging.warning(
985
+ "Type mismatch {}: Convert {} to {}.".format(
986
+ k, vars(args)[k], [vars(args)[k] for _ in range(args.num_encs)]
987
+ )
988
+ )
989
+ vars(args)[k] = [vars(args)[k] for _ in range(args.num_encs)]
990
+ return args
espnet/nets/.DS_Store ADDED
Binary file (6.15 kB).
 
espnet/nets/batch_beam_search.py ADDED
@@ -0,0 +1,349 @@
1
+ """Parallel beam search module."""
2
+
3
+ import logging
4
+ from typing import Any
5
+ from typing import Dict
6
+ from typing import List
7
+ from typing import NamedTuple
8
+ from typing import Tuple
9
+
10
+ import torch
11
+ from torch.nn.utils.rnn import pad_sequence
12
+
13
+ from espnet.nets.beam_search import BeamSearch
14
+ from espnet.nets.beam_search import Hypothesis
15
+
16
+
17
+ class BatchHypothesis(NamedTuple):
18
+ """Batchfied/Vectorized hypothesis data type."""
19
+
20
+ yseq: torch.Tensor = torch.tensor([]) # (batch, maxlen)
21
+ score: torch.Tensor = torch.tensor([]) # (batch,)
22
+ length: torch.Tensor = torch.tensor([]) # (batch,)
23
+ scores: Dict[str, torch.Tensor] = dict() # values: (batch,)
24
+ states: Dict[str, Dict] = dict()
25
+
26
+ def __len__(self) -> int:
27
+ """Return a batch size."""
28
+ return len(self.length)
29
+
30
+
31
+ class BatchBeamSearch(BeamSearch):
32
+ """Batch beam search implementation."""
33
+
34
+ def batchfy(self, hyps: List[Hypothesis]) -> BatchHypothesis:
35
+ """Convert list to batch."""
36
+ if len(hyps) == 0:
37
+ return BatchHypothesis()
38
+ yseq = pad_sequence(
39
+ [h.yseq for h in hyps], batch_first=True, padding_value=self.eos
40
+ )
41
+ return BatchHypothesis(
42
+ yseq=yseq,
43
+ length=torch.tensor([len(h.yseq) for h in hyps], dtype=torch.int64, device=yseq.device),
44
+ score=torch.tensor([h.score for h in hyps]).to(yseq.device),
45
+ scores={k: torch.tensor([h.scores[k] for h in hyps], device=yseq.device) for k in self.scorers},
46
+ states={k: [h.states[k] for h in hyps] for k in self.scorers},
47
+ )
48
+
49
+ def _batch_select(self, hyps: BatchHypothesis, ids: List[int]) -> BatchHypothesis:
50
+ return BatchHypothesis(
51
+ yseq=hyps.yseq[ids],
52
+ score=hyps.score[ids],
53
+ length=hyps.length[ids],
54
+ scores={k: v[ids] for k, v in hyps.scores.items()},
55
+ states={
56
+ k: [self.scorers[k].select_state(v, i) for i in ids]
57
+ for k, v in hyps.states.items()
58
+ },
59
+ )
60
+
61
+ def _select(self, hyps: BatchHypothesis, i: int) -> Hypothesis:
62
+ return Hypothesis(
63
+ yseq=hyps.yseq[i, : hyps.length[i]],
64
+ score=hyps.score[i],
65
+ scores={k: v[i] for k, v in hyps.scores.items()},
66
+ states={
67
+ k: self.scorers[k].select_state(v, i) for k, v in hyps.states.items()
68
+ },
69
+ )
70
+
71
+ def unbatchfy(self, batch_hyps: BatchHypothesis) -> List[Hypothesis]:
72
+ """Revert batch to list."""
73
+ return [
74
+ Hypothesis(
75
+ yseq=batch_hyps.yseq[i][: batch_hyps.length[i]],
76
+ score=batch_hyps.score[i],
77
+ scores={k: batch_hyps.scores[k][i] for k in self.scorers},
78
+ states={
79
+ k: v.select_state(batch_hyps.states[k], i)
80
+ for k, v in self.scorers.items()
81
+ },
82
+ )
83
+ for i in range(len(batch_hyps.length))
84
+ ]
85
+
86
+ def batch_beam(
87
+ self, weighted_scores: torch.Tensor, ids: torch.Tensor
88
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
89
+ """Batch-compute topk full token ids and partial token ids.
90
+
91
+ Args:
92
+ weighted_scores (torch.Tensor): The weighted sum scores for each tokens.
93
+ Its shape is `(n_beam, self.vocab_size)`.
94
+ ids (torch.Tensor): The partial token ids to compute topk.
95
+ Its shape is `(n_beam, self.pre_beam_size)`.
96
+
97
+ Returns:
98
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
99
+ The topk full (prev_hyp, new_token) ids
100
+ and partial (prev_hyp, new_token) ids.
101
+ Their shapes are all `(self.beam_size,)`
102
+
103
+ """
104
+ top_ids = weighted_scores.view(-1).topk(self.beam_size)[1]
105
+ # Because of the flatten above, `top_ids` is organized as:
106
+ # [hyp1 * V + token1, hyp2 * V + token2, ..., hypK * V + tokenK],
107
+ # where V is `self.n_vocab` and K is `self.beam_size`
108
+ prev_hyp_ids = torch.div(top_ids, self.n_vocab, rounding_mode='trunc')
109
+ new_token_ids = top_ids % self.n_vocab
110
+ return prev_hyp_ids, new_token_ids, prev_hyp_ids, new_token_ids
111
+
112
+ def init_hyp(self, x: torch.Tensor) -> BatchHypothesis:
113
+ """Get an initial hypothesis data.
114
+
115
+ Args:
116
+ x (torch.Tensor): The encoder output feature
117
+
118
+ Returns:
119
+ Hypothesis: The initial hypothesis.
120
+
121
+ """
122
+ init_states = dict()
123
+ init_scores = dict()
124
+ for k, d in self.scorers.items():
125
+ init_states[k] = d.batch_init_state(x)
126
+ init_scores[k] = 0.0
127
+ return self.batchfy(
128
+ [
129
+ Hypothesis(
130
+ score=0.0,
131
+ scores=init_scores,
132
+ states=init_states,
133
+ yseq=torch.tensor([self.sos], device=x.device),
134
+ )
135
+ ]
136
+ )
137
+
138
+ def score_full(
139
+ self, hyp: BatchHypothesis, x: torch.Tensor
140
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
141
+ """Score new hypothesis by `self.full_scorers`.
142
+
143
+ Args:
144
+ hyp (Hypothesis): Hypothesis with prefix tokens to score
145
+ x (torch.Tensor): Corresponding input feature
146
+
147
+ Returns:
148
+ Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
149
+ score dict of `hyp` that has string keys of `self.full_scorers`
150
+ and tensor score values of shape: `(self.n_vocab,)`,
151
+ and state dict that has string keys
152
+ and state values of `self.full_scorers`
153
+
154
+ """
155
+ scores = dict()
156
+ states = dict()
157
+ for k, d in self.full_scorers.items():
158
+ scores[k], states[k] = d.batch_score(hyp.yseq, hyp.states[k], x)
159
+ return scores, states
160
+
161
+ def score_partial(
162
+ self, hyp: BatchHypothesis, ids: torch.Tensor, x: torch.Tensor
163
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
164
+ """Score new hypothesis by `self.full_scorers`.
165
+
166
+ Args:
167
+ hyp (Hypothesis): Hypothesis with prefix tokens to score
168
+ ids (torch.Tensor): 2D tensor of new partial tokens to score
169
+ x (torch.Tensor): Corresponding input feature
170
+
171
+ Returns:
172
+ Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
173
+ score dict of `hyp` that has string keys of `self.full_scorers`
174
+ and tensor score values of shape: `(self.n_vocab,)`,
175
+ and state dict that has string keys
176
+ and state values of `self.full_scorers`
177
+
178
+ """
179
+ scores = dict()
180
+ states = dict()
181
+ for k, d in self.part_scorers.items():
182
+ scores[k], states[k] = d.batch_score_partial(
183
+ hyp.yseq, ids, hyp.states[k], x
184
+ )
185
+ return scores, states
186
+
187
+ def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
188
+ """Merge states for new hypothesis.
189
+
190
+ Args:
191
+ states: states of `self.full_scorers`
192
+ part_states: states of `self.part_scorers`
193
+ part_idx (int): The new token id for `part_scores`
194
+
195
+ Returns:
196
+ Dict[str, torch.Tensor]: The new score dict.
197
+ Its keys are names of `self.full_scorers` and `self.part_scorers`.
198
+ Its values are states of the scorers.
199
+
200
+ """
201
+ new_states = dict()
202
+ for k, v in states.items():
203
+ new_states[k] = v
204
+ for k, v in part_states.items():
205
+ new_states[k] = v
206
+ return new_states
207
+
208
+ def search(self, running_hyps: BatchHypothesis, x: torch.Tensor) -> BatchHypothesis:
209
+ """Search new tokens for running hypotheses and encoded speech x.
210
+
211
+ Args:
212
+ running_hyps (BatchHypothesis): Running hypotheses on beam
213
+ x (torch.Tensor): Encoded speech feature (T, D)
214
+
215
+ Returns:
216
+ BatchHypothesis: Best sorted hypotheses
217
+
218
+ """
219
+ n_batch = len(running_hyps)
220
+ part_ids = None # no pre-beam
221
+ # batch scoring
222
+ weighted_scores = torch.zeros(
223
+ n_batch, self.n_vocab, dtype=x.dtype, device=x.device
224
+ )
225
+ scores, states = self.score_full(running_hyps, x.expand(n_batch, *x.shape))
226
+ for k in self.full_scorers:
227
+ weighted_scores += self.weights[k] * scores[k]
228
+ # partial scoring
229
+ if self.do_pre_beam:
230
+ pre_beam_scores = (
231
+ weighted_scores
232
+ if self.pre_beam_score_key == "full"
233
+ else scores[self.pre_beam_score_key]
234
+ )
235
+ part_ids = torch.topk(pre_beam_scores, self.pre_beam_size, dim=-1)[1]
236
+ # NOTE(takaaki-hori): Unlike BeamSearch, we assume that score_partial returns
237
+ # full-size score matrices, which has non-zero scores for part_ids and zeros
238
+ # for others.
239
+ part_scores, part_states = self.score_partial(running_hyps, part_ids, x)
240
+ for k in self.part_scorers:
241
+ weighted_scores += self.weights[k] * part_scores[k]
242
+ # add previous hyp scores
243
+ weighted_scores += running_hyps.score.to(
244
+ dtype=x.dtype, device=x.device
245
+ ).unsqueeze(1)
246
+
247
+ # TODO(karita): do not use list. use batch instead
248
+ # see also https://github.com/espnet/espnet/pull/1402#discussion_r354561029
249
+ # update hyps
250
+ best_hyps = []
251
+ prev_hyps = self.unbatchfy(running_hyps)
252
+ for (
253
+ full_prev_hyp_id,
254
+ full_new_token_id,
255
+ part_prev_hyp_id,
256
+ part_new_token_id,
257
+ ) in zip(*self.batch_beam(weighted_scores, part_ids)):
258
+ prev_hyp = prev_hyps[full_prev_hyp_id]
259
+ best_hyps.append(
260
+ Hypothesis(
261
+ score=weighted_scores[full_prev_hyp_id, full_new_token_id],
262
+ yseq=self.append_token(prev_hyp.yseq, full_new_token_id),
263
+ scores=self.merge_scores(
264
+ prev_hyp.scores,
265
+ {k: v[full_prev_hyp_id] for k, v in scores.items()},
266
+ full_new_token_id,
267
+ {k: v[part_prev_hyp_id] for k, v in part_scores.items()},
268
+ part_new_token_id,
269
+ ),
270
+ states=self.merge_states(
271
+ {
272
+ k: self.full_scorers[k].select_state(v, full_prev_hyp_id)
273
+ for k, v in states.items()
274
+ },
275
+ {
276
+ k: self.part_scorers[k].select_state(
277
+ v, part_prev_hyp_id, part_new_token_id
278
+ )
279
+ for k, v in part_states.items()
280
+ },
281
+ part_new_token_id,
282
+ ),
283
+ )
284
+ )
285
+ return self.batchfy(best_hyps)
286
+
287
+ def post_process(
288
+ self,
289
+ i: int,
290
+ maxlen: int,
291
+ maxlenratio: float,
292
+ running_hyps: BatchHypothesis,
293
+ ended_hyps: List[Hypothesis],
294
+ ) -> BatchHypothesis:
295
+ """Perform post-processing of beam search iterations.
296
+
297
+ Args:
298
+ i (int): The length of hypothesis tokens.
299
+ maxlen (int): The maximum length of tokens in beam search.
300
+ maxlenratio (int): The maximum length ratio in beam search.
301
+ running_hyps (BatchHypothesis): The running hypotheses in beam search.
302
+ ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
303
+
304
+ Returns:
305
+ BatchHypothesis: The new running hypotheses.
306
+
307
+ """
308
+ n_batch = running_hyps.yseq.shape[0]
309
+ logging.debug(f"the number of running hypothes: {n_batch}")
310
+ if self.token_list is not None:
311
+ logging.debug(
312
+ "best hypo: "
313
+ + "".join(
314
+ [
315
+ self.token_list[x]
316
+ for x in running_hyps.yseq[0, 1 : running_hyps.length[0]]
317
+ ]
318
+ )
319
+ )
320
+ # add eos in the final loop so that at least one hypothesis ends
321
+ if i == maxlen - 1:
322
+ logging.info("adding <eos> in the last position in the loop")
323
+ yseq_eos = torch.cat(
324
+ (
325
+ running_hyps.yseq,
326
+ torch.full(
327
+ (n_batch, 1),
328
+ self.eos,
329
+ device=running_hyps.yseq.device,
330
+ dtype=torch.int64,
331
+ ),
332
+ ),
333
+ 1,
334
+ )
335
+ running_hyps.yseq.resize_as_(yseq_eos)
336
+ running_hyps.yseq[:] = yseq_eos
337
+ running_hyps.length[:] = yseq_eos.shape[1]
338
+
339
+ # add ended hypotheses to a final list, and remove them from the current hypotheses
340
+ # (this can be a problem when the number of hyps < beam)
341
+ is_eos = (
342
+ running_hyps.yseq[torch.arange(n_batch), running_hyps.length - 1]
343
+ == self.eos
344
+ )
345
+ for b in torch.nonzero(is_eos, as_tuple=False).view(-1):
346
+ hyp = self._select(running_hyps, b)
347
+ ended_hyps.append(hyp)
348
+ remained_ids = torch.nonzero(is_eos == 0, as_tuple=False).view(-1)
349
+ return self._batch_select(running_hyps, remained_ids)
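
The flattened top-k decomposition used in batch_beam above (recovering hypothesis and token indices from indices into a flattened (n_beam, n_vocab) score matrix via integer division and modulo) can be checked in isolation. The snippet below is a standalone sketch with arbitrary sizes, not code from this commit.

    # Standalone check of the index decomposition in BatchBeamSearch.batch_beam.
    # Sizes are arbitrary; only the // and % arithmetic mirrors the code above.
    import torch

    n_beam, n_vocab, beam_size = 3, 7, 4
    weighted_scores = torch.randn(n_beam, n_vocab)

    top_ids = weighted_scores.view(-1).topk(beam_size)[1]
    prev_hyp_ids = torch.div(top_ids, n_vocab, rounding_mode="trunc")
    new_token_ids = top_ids % n_vocab

    # Each selected flat index maps back to the same score entry.
    assert torch.equal(
        weighted_scores[prev_hyp_ids, new_token_ids],
        weighted_scores.view(-1)[top_ids],
    )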
espnet/nets/beam_search.py ADDED
@@ -0,0 +1,516 @@
1
+ """Beam search module."""
2
+
3
+ from itertools import chain
4
+ import logging
5
+ from typing import Any
6
+ from typing import Dict
7
+ from typing import List
8
+ from typing import NamedTuple
9
+ from typing import Tuple
10
+ from typing import Union
11
+
12
+ import torch
13
+
14
+ from espnet.nets.e2e_asr_common import end_detect
15
+ from espnet.nets.scorer_interface import PartialScorerInterface
16
+ from espnet.nets.scorer_interface import ScorerInterface
17
+
18
+
19
+ class Hypothesis(NamedTuple):
20
+ """Hypothesis data type."""
21
+
22
+ yseq: torch.Tensor
23
+ score: Union[float, torch.Tensor] = 0
24
+ scores: Dict[str, Union[float, torch.Tensor]] = dict()
25
+ states: Dict[str, Any] = dict()
26
+
27
+ def asdict(self) -> dict:
28
+ """Convert data to JSON-friendly dict."""
29
+ return self._replace(
30
+ yseq=self.yseq.tolist(),
31
+ score=float(self.score),
32
+ scores={k: float(v) for k, v in self.scores.items()},
33
+ )._asdict()
34
+
35
+
36
+ class BeamSearch(torch.nn.Module):
37
+ """Beam search implementation."""
38
+
39
+ def __init__(
40
+ self,
41
+ scorers: Dict[str, ScorerInterface],
42
+ weights: Dict[str, float],
43
+ beam_size: int,
44
+ vocab_size: int,
45
+ sos: int,
46
+ eos: int,
47
+ token_list: List[str] = None,
48
+ pre_beam_ratio: float = 1.5,
49
+ pre_beam_score_key: str = None,
50
+ ):
51
+ """Initialize beam search.
52
+
53
+ Args:
54
+ scorers (dict[str, ScorerInterface]): Dict of decoder modules
55
+ e.g., Decoder, CTCPrefixScorer, LM
56
+ The scorer will be ignored if it is `None`
57
+ weights (dict[str, float]): Dict of weights for each scorers
58
+ The scorer will be ignored if its weight is 0
59
+ beam_size (int): The number of hypotheses kept during search
60
+ vocab_size (int): The number of vocabulary
61
+ sos (int): Start of sequence id
62
+ eos (int): End of sequence id
63
+ token_list (list[str]): List of tokens for debug log
64
+ pre_beam_score_key (str): key of scores to perform pre-beam search
65
+ pre_beam_ratio (float): beam size in the pre-beam search
66
+ will be `int(pre_beam_ratio * beam_size)`
67
+
68
+ """
69
+ super().__init__()
70
+ # set scorers
71
+ self.weights = weights
72
+ self.scorers = dict()
73
+ self.full_scorers = dict()
74
+ self.part_scorers = dict()
75
+ # this module dict is required for recursive cast
76
+ # `self.to(device, dtype)` in `recog.py`
77
+ self.nn_dict = torch.nn.ModuleDict()
78
+ for k, v in scorers.items():
79
+ w = weights.get(k, 0)
80
+ if w == 0 or v is None:
81
+ continue
82
+ assert isinstance(
83
+ v, ScorerInterface
84
+ ), f"{k} ({type(v)}) does not implement ScorerInterface"
85
+ self.scorers[k] = v
86
+ if isinstance(v, PartialScorerInterface):
87
+ self.part_scorers[k] = v
88
+ else:
89
+ self.full_scorers[k] = v
90
+ if isinstance(v, torch.nn.Module):
91
+ self.nn_dict[k] = v
92
+
93
+ # set configurations
94
+ self.sos = sos
95
+ self.eos = eos
96
+ self.token_list = token_list
97
+ self.pre_beam_size = int(pre_beam_ratio * beam_size)
98
+ self.beam_size = beam_size
99
+ self.n_vocab = vocab_size
100
+ if (
101
+ pre_beam_score_key is not None
102
+ and pre_beam_score_key != "full"
103
+ and pre_beam_score_key not in self.full_scorers
104
+ ):
105
+ raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
106
+ self.pre_beam_score_key = pre_beam_score_key
107
+ self.do_pre_beam = (
108
+ self.pre_beam_score_key is not None
109
+ and self.pre_beam_size < self.n_vocab
110
+ and len(self.part_scorers) > 0
111
+ )
112
+
113
+ def init_hyp(self, x: torch.Tensor) -> List[Hypothesis]:
114
+ """Get an initial hypothesis data.
115
+
116
+ Args:
117
+ x (torch.Tensor): The encoder output feature
118
+
119
+ Returns:
120
+ Hypothesis: The initial hypothesis.
121
+
122
+ """
123
+ init_states = dict()
124
+ init_scores = dict()
125
+ for k, d in self.scorers.items():
126
+ init_states[k] = d.init_state(x)
127
+ init_scores[k] = 0.0
128
+ return [
129
+ Hypothesis(
130
+ score=0.0,
131
+ scores=init_scores,
132
+ states=init_states,
133
+ yseq=torch.tensor([self.sos], device=x.device),
134
+ )
135
+ ]
136
+
137
+ @staticmethod
138
+ def append_token(xs: torch.Tensor, x: int) -> torch.Tensor:
139
+ """Append new token to prefix tokens.
140
+
141
+ Args:
142
+ xs (torch.Tensor): The prefix token
143
+ x (int): The new token to append
144
+
145
+ Returns:
146
+ torch.Tensor: New tensor contains: xs + [x] with xs.dtype and xs.device
147
+
148
+ """
149
+ x = torch.tensor([x], dtype=xs.dtype, device=xs.device)
150
+ return torch.cat((xs, x))
151
+
152
+ def score_full(
153
+ self, hyp: Hypothesis, x: torch.Tensor
154
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
155
+ """Score new hypothesis by `self.full_scorers`.
156
+
157
+ Args:
158
+ hyp (Hypothesis): Hypothesis with prefix tokens to score
159
+ x (torch.Tensor): Corresponding input feature
160
+
161
+ Returns:
162
+ Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
163
+ score dict of `hyp` that has string keys of `self.full_scorers`
164
+ and tensor score values of shape: `(self.n_vocab,)`,
165
+ and state dict that has string keys
166
+ and state values of `self.full_scorers`
167
+
168
+ """
169
+ scores = dict()
170
+ states = dict()
171
+ for k, d in self.full_scorers.items():
172
+ scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
173
+ return scores, states
174
+
175
+ def score_partial(
176
+ self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
177
+ ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
178
+ """Score new hypothesis by `self.part_scorers`.
179
+
180
+ Args:
181
+ hyp (Hypothesis): Hypothesis with prefix tokens to score
182
+ ids (torch.Tensor): 1D tensor of new partial tokens to score
183
+ x (torch.Tensor): Corresponding input feature
184
+
185
+ Returns:
186
+ Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
187
+ score dict of `hyp` that has string keys of `self.part_scorers`
188
+ and tensor score values of shape: `(len(ids),)`,
189
+ and state dict that has string keys
190
+ and state values of `self.part_scorers`
191
+
192
+ """
193
+ scores = dict()
194
+ states = dict()
195
+ for k, d in self.part_scorers.items():
196
+ scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
197
+ return scores, states
198
+
199
+ def beam(
200
+ self, weighted_scores: torch.Tensor, ids: torch.Tensor
201
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
202
+ """Compute topk full token ids and partial token ids.
203
+
204
+ Args:
205
+ weighted_scores (torch.Tensor): The weighted sum scores for each tokens.
206
+ Its shape is `(self.n_vocab,)`.
207
+ ids (torch.Tensor): The partial token ids to compute topk
208
+
209
+ Returns:
210
+ Tuple[torch.Tensor, torch.Tensor]:
211
+ The topk full token ids and partial token ids.
212
+ Their shapes are `(self.beam_size,)`
213
+
214
+ """
215
+ # no pre beam performed
216
+ if weighted_scores.size(0) == ids.size(0):
217
+ top_ids = weighted_scores.topk(self.beam_size)[1]
218
+ return top_ids, top_ids
219
+
220
+ # mask pruned in pre-beam not to select in topk
221
+ tmp = weighted_scores[ids]
222
+ weighted_scores[:] = -float("inf")
223
+ weighted_scores[ids] = tmp
224
+ top_ids = weighted_scores.topk(self.beam_size)[1]
225
+ local_ids = weighted_scores[ids].topk(self.beam_size)[1]
226
+ return top_ids, local_ids
227
+
228
+ @staticmethod
229
+ def merge_scores(
230
+ prev_scores: Dict[str, float],
231
+ next_full_scores: Dict[str, torch.Tensor],
232
+ full_idx: int,
233
+ next_part_scores: Dict[str, torch.Tensor],
234
+ part_idx: int,
235
+ ) -> Dict[str, torch.Tensor]:
236
+ """Merge scores for new hypothesis.
237
+
238
+ Args:
239
+ prev_scores (Dict[str, float]):
240
+ The previous hypothesis scores by `self.scorers`
241
+ next_full_scores (Dict[str, torch.Tensor]): scores by `self.full_scorers`
242
+ full_idx (int): The next token id for `next_full_scores`
243
+ next_part_scores (Dict[str, torch.Tensor]):
244
+ scores of partial tokens by `self.part_scorers`
245
+ part_idx (int): The new token id for `next_part_scores`
246
+
247
+ Returns:
248
+ Dict[str, torch.Tensor]: The new score dict.
249
+ Its keys are names of `self.full_scorers` and `self.part_scorers`.
250
+ Its values are scalar tensors by the scorers.
251
+
252
+ """
253
+ new_scores = dict()
254
+ for k, v in next_full_scores.items():
255
+ new_scores[k] = prev_scores[k] + v[full_idx]
256
+ for k, v in next_part_scores.items():
257
+ new_scores[k] = prev_scores[k] + v[part_idx]
258
+ return new_scores
259
+
260
+ def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
261
+ """Merge states for new hypothesis.
262
+
263
+ Args:
264
+ states: states of `self.full_scorers`
265
+ part_states: states of `self.part_scorers`
266
+ part_idx (int): The new token id for `part_scores`
267
+
268
+ Returns:
269
+ Dict[str, torch.Tensor]: The new score dict.
270
+ Its keys are names of `self.full_scorers` and `self.part_scorers`.
271
+ Its values are states of the scorers.
272
+
273
+ """
274
+ new_states = dict()
275
+ for k, v in states.items():
276
+ new_states[k] = v
277
+ for k, d in self.part_scorers.items():
278
+ new_states[k] = d.select_state(part_states[k], part_idx)
279
+ return new_states
280
+
281
+ def search(
282
+ self, running_hyps: List[Hypothesis], x: torch.Tensor
283
+ ) -> List[Hypothesis]:
284
+ """Search new tokens for running hypotheses and encoded speech x.
285
+
286
+ Args:
287
+ running_hyps (List[Hypothesis]): Running hypotheses on beam
288
+ x (torch.Tensor): Encoded speech feature (T, D)
289
+
290
+ Returns:
291
+ List[Hypotheses]: Best sorted hypotheses
292
+
293
+ """
294
+ best_hyps = []
295
+ part_ids = torch.arange(self.n_vocab, device=x.device) # no pre-beam
296
+ for hyp in running_hyps:
297
+ # scoring
298
+ weighted_scores = torch.zeros(self.n_vocab, dtype=x.dtype, device=x.device)
299
+ scores, states = self.score_full(hyp, x)
300
+ for k in self.full_scorers:
301
+ weighted_scores += self.weights[k] * scores[k]
302
+ # partial scoring
303
+ if self.do_pre_beam:
304
+ pre_beam_scores = (
305
+ weighted_scores
306
+ if self.pre_beam_score_key == "full"
307
+ else scores[self.pre_beam_score_key]
308
+ )
309
+ part_ids = torch.topk(pre_beam_scores, self.pre_beam_size)[1]
310
+ part_scores, part_states = self.score_partial(hyp, part_ids, x)
311
+ for k in self.part_scorers:
312
+ weighted_scores[part_ids] += self.weights[k] * part_scores[k]
313
+ # add previous hyp score
314
+ weighted_scores += hyp.score
315
+
316
+ # update hyps
317
+ for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
318
+ # will be (2 x beam at most)
319
+ best_hyps.append(
320
+ Hypothesis(
321
+ score=weighted_scores[j],
322
+ yseq=self.append_token(hyp.yseq, j),
323
+ scores=self.merge_scores(
324
+ hyp.scores, scores, j, part_scores, part_j
325
+ ),
326
+ states=self.merge_states(states, part_states, part_j),
327
+ )
328
+ )
329
+
330
+ # sort and prune 2 x beam -> beam
331
+ best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
332
+ : min(len(best_hyps), self.beam_size)
333
+ ]
334
+ return best_hyps
335
+
336
+ def forward(
337
+ self, x: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
338
+ ) -> List[Hypothesis]:
339
+ """Perform beam search.
340
+
341
+ Args:
342
+ x (torch.Tensor): Encoded speech feature (T, D)
343
+ maxlenratio (float): Input length ratio to obtain max output length.
344
+ If maxlenratio=0.0 (default), it uses a end-detect function
345
+ to automatically find maximum hypothesis lengths
346
+ If maxlenratio<0.0, its absolute value is interpreted
347
+ as a constant max output length.
348
+ minlenratio (float): Input length ratio to obtain min output length.
349
+
350
+ Returns:
351
+ list[Hypothesis]: N-best decoding results
352
+
353
+ """
354
+ # set length bounds
355
+ if maxlenratio == 0:
356
+ maxlen = x.shape[0]
357
+ elif maxlenratio < 0:
358
+ maxlen = -1 * int(maxlenratio)
359
+ else:
360
+ maxlen = max(1, int(maxlenratio * x.size(0)))
361
+ minlen = int(minlenratio * x.size(0))
362
+ logging.info("decoder input length: " + str(x.shape[0]))
363
+ logging.info("max output length: " + str(maxlen))
364
+ logging.info("min output length: " + str(minlen))
365
+
366
+ # main loop of prefix search
367
+ running_hyps = self.init_hyp(x)
368
+ ended_hyps = []
369
+ for i in range(maxlen):
370
+ logging.debug("position " + str(i))
371
+ best = self.search(running_hyps, x)
372
+ # post process of one iteration
373
+ running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
374
+ # end detection
375
+ if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
376
+ logging.info(f"end detected at {i}")
377
+ break
378
+ if len(running_hyps) == 0:
379
+ logging.info("no hypothesis. Finish decoding.")
380
+ break
381
+ else:
382
+ logging.debug(f"remained hypotheses: {len(running_hyps)}")
383
+
384
+ nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
385
+ # check the number of hypotheses reaching to eos
386
+ if len(nbest_hyps) == 0:
387
+ logging.warning(
388
+ "there is no N-best results, perform recognition "
389
+ "again with smaller minlenratio."
390
+ )
391
+ return (
392
+ []
393
+ if minlenratio < 0.1
394
+ else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
395
+ )
396
+
397
+ # report the best result
398
+ best = nbest_hyps[0]
399
+ for k, v in best.scores.items():
400
+ logging.info(
401
+ f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
402
+ )
403
+ logging.info(f"total log probability: {best.score:.2f}")
404
+ logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
405
+ logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
406
+ if self.token_list is not None:
407
+ logging.info(
408
+ "best hypo: "
409
+ + "".join([self.token_list[x] for x in best.yseq[1:-1]])
410
+ + "\n"
411
+ )
412
+ return nbest_hyps
413
+
414
+ def post_process(
415
+ self,
416
+ i: int,
417
+ maxlen: int,
418
+ maxlenratio: float,
419
+ running_hyps: List[Hypothesis],
420
+ ended_hyps: List[Hypothesis],
421
+ ) -> List[Hypothesis]:
422
+ """Perform post-processing of beam search iterations.
423
+
424
+ Args:
425
+ i (int): The length of hypothesis tokens.
426
+ maxlen (int): The maximum length of tokens in beam search.
427
+ maxlenratio (int): The maximum length ratio in beam search.
428
+ running_hyps (List[Hypothesis]): The running hypotheses in beam search.
429
+ ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
430
+
431
+ Returns:
432
+ List[Hypothesis]: The new running hypotheses.
433
+
434
+ """
435
+ logging.debug(f"the number of running hypotheses: {len(running_hyps)}")
436
+ if self.token_list is not None:
437
+ logging.debug(
438
+ "best hypo: "
439
+ + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
440
+ )
441
+ # add eos in the final loop to avoid that there are no ended hyps
442
+ if i == maxlen - 1:
443
+ logging.info("adding <eos> in the last position in the loop")
444
+ running_hyps = [
445
+ h._replace(yseq=self.append_token(h.yseq, self.eos))
446
+ for h in running_hyps
447
+ ]
448
+
449
+ # add ended hypotheses to a final list, and removed them from current hypotheses
450
+ # (this will be a problem, number of hyps < beam)
451
+ remained_hyps = []
452
+ for hyp in running_hyps:
453
+ if hyp.yseq[-1] == self.eos:
454
+ # e.g., Word LM needs to add final <eos> score
455
+ for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
456
+ s = d.final_score(hyp.states[k])
457
+ hyp.scores[k] += s
458
+ hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
459
+ ended_hyps.append(hyp)
460
+ else:
461
+ remained_hyps.append(hyp)
462
+ return remained_hyps
463
+
464
+
465
+ def beam_search(
466
+ x: torch.Tensor,
467
+ sos: int,
468
+ eos: int,
469
+ beam_size: int,
470
+ vocab_size: int,
471
+ scorers: Dict[str, ScorerInterface],
472
+ weights: Dict[str, float],
473
+ token_list: List[str] = None,
474
+ maxlenratio: float = 0.0,
475
+ minlenratio: float = 0.0,
476
+ pre_beam_ratio: float = 1.5,
477
+ pre_beam_score_key: str = "full",
478
+ ) -> list:
479
+ """Perform beam search with scorers.
480
+
481
+ Args:
482
+ x (torch.Tensor): Encoded speech feature (T, D)
483
+ sos (int): Start of sequence id
484
+ eos (int): End of sequence id
485
+ beam_size (int): The number of hypotheses kept during search
486
+ vocab_size (int): The number of vocabulary
487
+ scorers (dict[str, ScorerInterface]): Dict of decoder modules
488
+ e.g., Decoder, CTCPrefixScorer, LM
489
+ The scorer will be ignored if it is `None`
490
+ weights (dict[str, float]): Dict of weights for each scorers
491
+ The scorer will be ignored if its weight is 0
492
+ token_list (list[str]): List of tokens for debug log
493
+ maxlenratio (float): Input length ratio to obtain max output length.
494
+ If maxlenratio=0.0 (default), it uses a end-detect function
495
+ to automatically find maximum hypothesis lengths
496
+ minlenratio (float): Input length ratio to obtain min output length.
497
+ pre_beam_score_key (str): key of scores to perform pre-beam search
498
+ pre_beam_ratio (float): beam size in the pre-beam search
499
+ will be `int(pre_beam_ratio * beam_size)`
500
+
501
+ Returns:
502
+ list: N-best decoding results
503
+
504
+ """
505
+ ret = BeamSearch(
506
+ scorers,
507
+ weights,
508
+ beam_size=beam_size,
509
+ vocab_size=vocab_size,
510
+ pre_beam_ratio=pre_beam_ratio,
511
+ pre_beam_score_key=pre_beam_score_key,
512
+ sos=sos,
513
+ eos=eos,
514
+ token_list=token_list,
515
+ ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
516
+ return [h.asdict() for h in ret]
espnet/nets/ctc_prefix_score.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2018 Mitsubishi Electric Research Labs (Takaaki Hori)
4
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
5
+
6
+ import torch
7
+
8
+ import numpy as np
9
+ import six
10
+
11
+
12
+ class CTCPrefixScoreTH(object):
13
+ """Batch processing of CTCPrefixScore
14
+
15
+ which is based on Algorithm 2 in WATANABE et al.
16
+ "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
17
+ but extended to efficiently compute the label probablities for multiple
18
+ hypotheses simultaneously
19
+ See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based
20
+ Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019.
21
+ """
22
+
23
+ def __init__(self, x, xlens, blank, eos, margin=0):
24
+ """Construct CTC prefix scorer
25
+
26
+ :param torch.Tensor x: input label posterior sequences (B, T, O)
27
+ :param torch.Tensor xlens: input lengths (B,)
28
+ :param int blank: blank label id
29
+ :param int eos: end-of-sequence id
30
+ :param int margin: margin parameter for windowing (0 means no windowing)
31
+ """
32
+ # In the comment lines,
33
+ # we assume T: input_length, B: batch size, W: beam width, O: output dim.
34
+ self.logzero = -10000000000.0
35
+ self.blank = blank
36
+ self.eos = eos
37
+ self.batch = x.size(0)
38
+ self.input_length = x.size(1)
39
+ self.odim = x.size(2)
40
+ self.dtype = x.dtype
41
+ self.device = (
42
+ torch.device("cuda:%d" % x.get_device())
43
+ if x.is_cuda
44
+ else torch.device("cpu")
45
+ )
46
+ # Pad the rest of posteriors in the batch
47
+ # TODO(takaaki-hori): need a better way without for-loops
48
+ for i, l in enumerate(xlens):
49
+ if l < self.input_length:
50
+ x[i, l:, :] = self.logzero
51
+ x[i, l:, blank] = 0
52
+ # Reshape input x
53
+ xn = x.transpose(0, 1) # (B, T, O) -> (T, B, O)
54
+ xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
55
+ self.x = torch.stack([xn, xb]) # (2, T, B, O)
56
+ self.end_frames = torch.as_tensor(xlens) - 1
57
+
58
+ # Setup CTC windowing
59
+ self.margin = margin
60
+ if margin > 0:
61
+ self.frame_ids = torch.arange(
62
+ self.input_length, dtype=self.dtype, device=self.device
63
+ )
64
+ # Base indices for index conversion
65
+ self.idx_bh = None
66
+ self.idx_b = torch.arange(self.batch, device=self.device)
67
+ self.idx_bo = (self.idx_b * self.odim).unsqueeze(1)
68
+
69
+ def __call__(self, y, state, scoring_ids=None, att_w=None):
70
+ """Compute CTC prefix scores for next labels
71
+
72
+ :param list y: prefix label sequences
73
+ :param tuple state: previous CTC state
74
+ :param torch.Tensor pre_scores: scores for pre-selection of hypotheses (BW, O)
75
+ :param torch.Tensor att_w: attention weights to decide CTC window
76
+ :return new_state, ctc_local_scores (BW, O)
77
+ """
78
+ output_length = len(y[0]) - 1 # ignore sos
79
+ last_ids = [yi[-1] for yi in y] # last output label ids
80
+ n_bh = len(last_ids) # batch * hyps
81
+ n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps
82
+ self.scoring_num = scoring_ids.size(-1) if scoring_ids is not None else 0
83
+ # prepare state info
84
+ if state is None:
85
+ r_prev = torch.full(
86
+ (self.input_length, 2, self.batch, n_hyps),
87
+ self.logzero,
88
+ dtype=self.dtype,
89
+ device=self.device,
90
+ )
91
+ r_prev[:, 1] = torch.cumsum(self.x[0, :, :, self.blank], 0).unsqueeze(2)
92
+ r_prev = r_prev.view(-1, 2, n_bh)
93
+ s_prev = 0.0
94
+ f_min_prev = 0
95
+ f_max_prev = 1
96
+ else:
97
+ r_prev, s_prev, f_min_prev, f_max_prev = state
98
+
99
+ # select input dimensions for scoring
100
+ if self.scoring_num > 0:
101
+ scoring_idmap = torch.full(
102
+ (n_bh, self.odim), -1, dtype=torch.long, device=self.device
103
+ )
104
+ snum = self.scoring_num
105
+ if self.idx_bh is None or n_bh > len(self.idx_bh):
106
+ self.idx_bh = torch.arange(n_bh, device=self.device).view(-1, 1)
107
+ scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = torch.arange(
108
+ snum, device=self.device
109
+ )
110
+ scoring_idx = (
111
+ scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1, 1)
112
+ ).view(-1)
113
+ x_ = torch.index_select(
114
+ self.x.view(2, -1, self.batch * self.odim), 2, scoring_idx
115
+ ).view(2, -1, n_bh, snum)
116
+ else:
117
+ scoring_ids = None
118
+ scoring_idmap = None
119
+ snum = self.odim
120
+ x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1, n_bh, snum)
121
+
122
+ # new CTC forward probs are prepared as a (T x 2 x BW x S) tensor
123
+ # that corresponds to r_t^n(h) and r_t^b(h) in a batch.
124
+ r = torch.full(
125
+ (self.input_length, 2, n_bh, snum),
126
+ self.logzero,
127
+ dtype=self.dtype,
128
+ device=self.device,
129
+ )
130
+ if output_length == 0:
131
+ r[0, 0] = x_[0, 0]
132
+
133
+ r_sum = torch.logsumexp(r_prev, 1)
134
+ log_phi = r_sum.unsqueeze(2).repeat(1, 1, snum)
135
+ if scoring_ids is not None:
136
+ for idx in range(n_bh):
137
+ pos = scoring_idmap[idx, last_ids[idx]]
138
+ if pos >= 0:
139
+ log_phi[:, idx, pos] = r_prev[:, 1, idx]
140
+ else:
141
+ for idx in range(n_bh):
142
+ log_phi[:, idx, last_ids[idx]] = r_prev[:, 1, idx]
143
+
144
+ # decide start and end frames based on attention weights
145
+ if att_w is not None and self.margin > 0:
146
+ f_arg = torch.matmul(att_w, self.frame_ids)
147
+ f_min = max(int(f_arg.min().cpu()), f_min_prev)
148
+ f_max = max(int(f_arg.max().cpu()), f_max_prev)
149
+ start = min(f_max_prev, max(f_min - self.margin, output_length, 1))
150
+ end = min(f_max + self.margin, self.input_length)
151
+ else:
152
+ f_min = f_max = 0
153
+ start = max(output_length, 1)
154
+ end = self.input_length
155
+
156
+ # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
157
+ for t in range(start, end):
158
+ rp = r[t - 1]
159
+ rr = torch.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
160
+ 2, 2, n_bh, snum
161
+ )
162
+ r[t] = torch.logsumexp(rr, 1) + x_[:, t]
163
+
164
+ # compute log prefix probabilities log(psi)
165
+ log_phi_x = torch.cat((log_phi[0].unsqueeze(0), log_phi[:-1]), dim=0) + x_[0]
166
+ if scoring_ids is not None:
167
+ log_psi = torch.full(
168
+ (n_bh, self.odim), self.logzero, dtype=self.dtype, device=self.device
169
+ )
170
+ log_psi_ = torch.logsumexp(
171
+ torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
172
+ dim=0,
173
+ )
174
+ for si in range(n_bh):
175
+ log_psi[si, scoring_ids[si]] = log_psi_[si]
176
+ else:
177
+ log_psi = torch.logsumexp(
178
+ torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
179
+ dim=0,
180
+ )
181
+
182
+ for si in range(n_bh):
183
+ log_psi[si, self.eos] = r_sum[self.end_frames[si // n_hyps], si]
184
+
185
+ # exclude blank probs
186
+ log_psi[:, self.blank] = self.logzero
187
+
188
+ return (log_psi - s_prev), (r, log_psi, f_min, f_max, scoring_idmap)
189
+
190
+ def index_select_state(self, state, best_ids):
191
+ """Select CTC states according to best ids
192
+
193
+ :param state : CTC state
194
+ :param best_ids : index numbers selected by beam pruning (B, W)
195
+ :return selected_state
196
+ """
197
+ r, s, f_min, f_max, scoring_idmap = state
198
+ # convert ids to BHO space
199
+ n_bh = len(s)
200
+ n_hyps = n_bh // self.batch
201
+ vidx = (best_ids + (self.idx_b * (n_hyps * self.odim)).view(-1, 1)).view(-1)
202
+ # select hypothesis scores
203
+ s_new = torch.index_select(s.view(-1), 0, vidx)
204
+ s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim)
205
+ # convert ids to BHS space (S: scoring_num)
206
+ if scoring_idmap is not None:
207
+ snum = self.scoring_num
208
+ hyp_idx = (best_ids // self.odim + (self.idx_b * n_hyps).view(-1, 1)).view(
209
+ -1
210
+ )
211
+ label_ids = torch.fmod(best_ids, self.odim).view(-1)
212
+ score_idx = scoring_idmap[hyp_idx, label_ids]
213
+ score_idx[score_idx == -1] = 0
214
+ vidx = score_idx + hyp_idx * snum
215
+ else:
216
+ snum = self.odim
217
+ # select forward probabilities
218
+ r_new = torch.index_select(r.view(-1, 2, n_bh * snum), 2, vidx).view(
219
+ -1, 2, n_bh
220
+ )
221
+ return r_new, s_new, f_min, f_max
222
+
223
+ def extend_prob(self, x):
224
+ """Extend CTC prob.
225
+
226
+ :param torch.Tensor x: input label posterior sequences (B, T, O)
227
+ """
228
+
229
+ if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O)
230
+ # Pad the rest of posteriors in the batch
231
+ # TODO(takaaki-hori): need a better way without for-loops
232
+ xlens = [x.size(1)]
233
+ for i, l in enumerate(xlens):
234
+ if l < self.input_length:
235
+ x[i, l:, :] = self.logzero
236
+ x[i, l:, self.blank] = 0
237
+ tmp_x = self.x
238
+ xn = x.transpose(0, 1) # (B, T, O) -> (T, B, O)
239
+ xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
240
+ self.x = torch.stack([xn, xb]) # (2, T, B, O)
241
+ self.x[:, : tmp_x.shape[1], :, :] = tmp_x
242
+ self.input_length = x.size(1)
243
+ self.end_frames = torch.as_tensor(xlens) - 1
244
+
245
+ def extend_state(self, state):
246
+ """Compute CTC prefix state.
247
+
248
+
249
+ :param state : CTC state
250
+ :return ctc_state
251
+ """
252
+
253
+ if state is None:
254
+ # nothing to do
255
+ return state
256
+ else:
257
+ r_prev, s_prev, f_min_prev, f_max_prev = state
258
+
259
+ r_prev_new = torch.full(
260
+ (self.input_length, 2),
261
+ self.logzero,
262
+ dtype=self.dtype,
263
+ device=self.device,
264
+ )
265
+ start = max(r_prev.shape[0], 1)
266
+ r_prev_new[0:start] = r_prev
267
+ for t in six.moves.range(start, self.input_length):
268
+ r_prev_new[t, 1] = r_prev_new[t - 1, 1] + self.x[0, t, :, self.blank]
269
+
270
+ return (r_prev_new, s_prev, f_min_prev, f_max_prev)
271
+
272
+
273
+ class CTCPrefixScore(object):
274
+ """Compute CTC label sequence scores
275
+
276
+ which is based on Algorithm 2 in WATANABE et al.
277
+ "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
278
+ but extended to efficiently compute the probablities of multiple labels
279
+ simultaneously
280
+ """
281
+
282
+ def __init__(self, x, blank, eos, xp):
283
+ self.xp = xp
284
+ self.logzero = -10000000000.0
285
+ self.blank = blank
286
+ self.eos = eos
287
+ self.input_length = len(x)
288
+ self.x = x
289
+
290
+ def initial_state(self):
291
+ """Obtain an initial CTC state
292
+
293
+ :return: CTC state
294
+ """
295
+ # initial CTC state is made of a frame x 2 tensor that corresponds to
296
+ # r_t^n(<sos>) and r_t^b(<sos>), where 0 and 1 of axis=1 represent
297
+ # superscripts n and b (non-blank and blank), respectively.
298
+ r = self.xp.full((self.input_length, 2), self.logzero, dtype=np.float32)
299
+ r[0, 1] = self.x[0, self.blank]
300
+ for i in six.moves.range(1, self.input_length):
301
+ r[i, 1] = r[i - 1, 1] + self.x[i, self.blank]
302
+ return r
303
+
304
+ def __call__(self, y, cs, r_prev):
305
+ """Compute CTC prefix scores for next labels
306
+
307
+ :param y : prefix label sequence
308
+ :param cs : array of next labels
309
+ :param r_prev: previous CTC state
310
+ :return ctc_scores, ctc_states
311
+ """
312
+ # initialize CTC states
313
+ output_length = len(y) - 1 # ignore sos
314
+ # new CTC states are prepared as a frame x (n or b) x n_labels tensor
315
+ # that corresponds to r_t^n(h) and r_t^b(h).
316
+ r = self.xp.ndarray((self.input_length, 2, len(cs)), dtype=np.float32)
317
+ xs = self.x[:, cs]
318
+ if output_length == 0:
319
+ r[0, 0] = xs[0]
320
+ r[0, 1] = self.logzero
321
+ else:
322
+ r[output_length - 1] = self.logzero
323
+
324
+ # prepare forward probabilities for the last label
325
+ r_sum = self.xp.logaddexp(
326
+ r_prev[:, 0], r_prev[:, 1]
327
+ ) # log(r_t^n(g) + r_t^b(g))
328
+ last = y[-1]
329
+ if output_length > 0 and last in cs:
330
+ log_phi = self.xp.ndarray((self.input_length, len(cs)), dtype=np.float32)
331
+ for i in six.moves.range(len(cs)):
332
+ log_phi[:, i] = r_sum if cs[i] != last else r_prev[:, 1]
333
+ else:
334
+ log_phi = r_sum
335
+
336
+ # compute forward probabilities log(r_t^n(h)), log(r_t^b(h)),
337
+ # and log prefix probabilities log(psi)
338
+ start = max(output_length, 1)
339
+ log_psi = r[start - 1, 0]
340
+ for t in six.moves.range(start, self.input_length):
341
+ r[t, 0] = self.xp.logaddexp(r[t - 1, 0], log_phi[t - 1]) + xs[t]
342
+ r[t, 1] = (
343
+ self.xp.logaddexp(r[t - 1, 0], r[t - 1, 1]) + self.x[t, self.blank]
344
+ )
345
+ log_psi = self.xp.logaddexp(log_psi, log_phi[t - 1] + xs[t])
346
+
347
+ # get P(...eos|X) that ends with the prefix itself
348
+ eos_pos = self.xp.where(cs == self.eos)[0]
349
+ if len(eos_pos) > 0:
350
+ log_psi[eos_pos] = r_sum[-1] # log(r_T^n(g) + r_T^b(g))
351
+
352
+ # exclude blank probs
353
+ blank_pos = self.xp.where(cs == self.blank)[0]
354
+ if len(blank_pos) > 0:
355
+ log_psi[blank_pos] = self.logzero
356
+
357
+ # return the log prefix probability and CTC states, where the label axis
358
+ # of the CTC states is moved to the first axis to slice it easily
359
+ return log_psi, self.xp.rollaxis(r, 2)
espnet/nets/e2e_asr_common.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # encoding: utf-8
3
+
4
+ # Copyright 2017 Johns Hopkins University (Shinji Watanabe)
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Common functions for ASR."""
8
+
9
+ import json
10
+ import logging
11
+ import sys
12
+
13
+ from itertools import groupby
14
+ import numpy as np
15
+ import six
16
+
17
+
18
+ def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
19
+ """End detection.
20
+
21
+ described in Eq. (50) of S. Watanabe et al
22
+ "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition"
23
+
24
+ :param ended_hyps:
25
+ :param i:
26
+ :param M:
27
+ :param D_end:
28
+ :return:
29
+ """
30
+ if len(ended_hyps) == 0:
31
+ return False
32
+ count = 0
33
+ best_hyp = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[0]
34
+ for m in six.moves.range(M):
35
+ # get ended_hyps with their length is i - m
36
+ hyp_length = i - m
37
+ hyps_same_length = [x for x in ended_hyps if len(x["yseq"]) == hyp_length]
38
+ if len(hyps_same_length) > 0:
39
+ best_hyp_same_length = sorted(
40
+ hyps_same_length, key=lambda x: x["score"], reverse=True
41
+ )[0]
42
+ if best_hyp_same_length["score"] - best_hyp["score"] < D_end:
43
+ count += 1
44
+
45
+ if count == M:
46
+ return True
47
+ else:
48
+ return False
49
+
50
+
51
+ # TODO(takaaki-hori): add different smoothing methods
52
+ def label_smoothing_dist(odim, lsm_type, transcript=None, blank=0):
53
+ """Obtain label distribution for loss smoothing.
54
+
55
+ :param odim:
56
+ :param lsm_type:
57
+ :param blank:
58
+ :param transcript:
59
+ :return:
60
+ """
61
+ if transcript is not None:
62
+ with open(transcript, "rb") as f:
63
+ trans_json = json.load(f)["utts"]
64
+
65
+ if lsm_type == "unigram":
66
+ assert transcript is not None, (
67
+ "transcript is required for %s label smoothing" % lsm_type
68
+ )
69
+ labelcount = np.zeros(odim)
70
+ for k, v in trans_json.items():
71
+ ids = np.array([int(n) for n in v["output"][0]["tokenid"].split()])
72
+ # to avoid an error when there is no text in an uttrance
73
+ if len(ids) > 0:
74
+ labelcount[ids] += 1
75
+ labelcount[odim - 1] = len(transcript) # count <eos>
76
+ labelcount[labelcount == 0] = 1 # flooring
77
+ labelcount[blank] = 0 # remove counts for blank
78
+ labeldist = labelcount.astype(np.float32) / np.sum(labelcount)
79
+ else:
80
+ logging.error("Error: unexpected label smoothing type: %s" % lsm_type)
81
+ sys.exit()
82
+
83
+ return labeldist
84
+
85
+
86
+ def get_vgg2l_odim(idim, in_channel=3, out_channel=128):
87
+ """Return the output size of the VGG frontend.
88
+
89
+ :param in_channel: input channel size
90
+ :param out_channel: output channel size
91
+ :return: output size
92
+ :rtype int
93
+ """
94
+ idim = idim / in_channel
95
+ idim = np.ceil(np.array(idim, dtype=np.float32) / 2) # 1st max pooling
96
+ idim = np.ceil(np.array(idim, dtype=np.float32) / 2) # 2nd max pooling
97
+ return int(idim) * out_channel # numer of channels
98
+
99
+
100
+ class ErrorCalculator(object):
101
+ """Calculate CER and WER for E2E_ASR and CTC models during training.
102
+
103
+ :param y_hats: numpy array with predicted text
104
+ :param y_pads: numpy array with true (target) text
105
+ :param char_list:
106
+ :param sym_space:
107
+ :param sym_blank:
108
+ :return:
109
+ """
110
+
111
+ def __init__(
112
+ self, char_list, sym_space, sym_blank, report_cer=False, report_wer=False
113
+ ):
114
+ """Construct an ErrorCalculator object."""
115
+ super(ErrorCalculator, self).__init__()
116
+
117
+ self.report_cer = report_cer
118
+ self.report_wer = report_wer
119
+
120
+ self.char_list = char_list
121
+ self.space = sym_space
122
+ self.blank = sym_blank
123
+ self.idx_blank = self.char_list.index(self.blank)
124
+ if self.space in self.char_list:
125
+ self.idx_space = self.char_list.index(self.space)
126
+ else:
127
+ self.idx_space = None
128
+
129
+ def __call__(self, ys_hat, ys_pad, is_ctc=False):
130
+ """Calculate sentence-level WER/CER score.
131
+
132
+ :param torch.Tensor ys_hat: prediction (batch, seqlen)
133
+ :param torch.Tensor ys_pad: reference (batch, seqlen)
134
+ :param bool is_ctc: calculate CER score for CTC
135
+ :return: sentence-level WER score
136
+ :rtype float
137
+ :return: sentence-level CER score
138
+ :rtype float
139
+ """
140
+ cer, wer = None, None
141
+ if is_ctc:
142
+ return self.calculate_cer_ctc(ys_hat, ys_pad)
143
+ elif not self.report_cer and not self.report_wer:
144
+ return cer, wer
145
+
146
+ seqs_hat, seqs_true = self.convert_to_char(ys_hat, ys_pad)
147
+ if self.report_cer:
148
+ cer = self.calculate_cer(seqs_hat, seqs_true)
149
+
150
+ if self.report_wer:
151
+ wer = self.calculate_wer(seqs_hat, seqs_true)
152
+ return cer, wer
153
+
154
+ def calculate_cer_ctc(self, ys_hat, ys_pad):
155
+ """Calculate sentence-level CER score for CTC.
156
+
157
+ :param torch.Tensor ys_hat: prediction (batch, seqlen)
158
+ :param torch.Tensor ys_pad: reference (batch, seqlen)
159
+ :return: average sentence-level CER score
160
+ :rtype float
161
+ """
162
+ import editdistance
163
+
164
+ cers, char_ref_lens = [], []
165
+ for i, y in enumerate(ys_hat):
166
+ y_hat = [x[0] for x in groupby(y)]
167
+ y_true = ys_pad[i]
168
+ seq_hat, seq_true = [], []
169
+ for idx in y_hat:
170
+ idx = int(idx)
171
+ if idx != -1 and idx != self.idx_blank and idx != self.idx_space:
172
+ seq_hat.append(self.char_list[int(idx)])
173
+
174
+ for idx in y_true:
175
+ idx = int(idx)
176
+ if idx != -1 and idx != self.idx_blank and idx != self.idx_space:
177
+ seq_true.append(self.char_list[int(idx)])
178
+
179
+ hyp_chars = "".join(seq_hat)
180
+ ref_chars = "".join(seq_true)
181
+ if len(ref_chars) > 0:
182
+ cers.append(editdistance.eval(hyp_chars, ref_chars))
183
+ char_ref_lens.append(len(ref_chars))
184
+
185
+ cer_ctc = float(sum(cers)) / sum(char_ref_lens) if cers else None
186
+ return cer_ctc
187
+
188
+ def convert_to_char(self, ys_hat, ys_pad):
189
+ """Convert index to character.
190
+
191
+ :param torch.Tensor seqs_hat: prediction (batch, seqlen)
192
+ :param torch.Tensor seqs_true: reference (batch, seqlen)
193
+ :return: token list of prediction
194
+ :rtype list
195
+ :return: token list of reference
196
+ :rtype list
197
+ """
198
+ seqs_hat, seqs_true = [], []
199
+ for i, y_hat in enumerate(ys_hat):
200
+ y_true = ys_pad[i]
201
+ eos_true = np.where(y_true == -1)[0]
202
+ ymax = eos_true[0] if len(eos_true) > 0 else len(y_true)
203
+ # NOTE: padding index (-1) in y_true is used to pad y_hat
204
+ seq_hat = [self.char_list[int(idx)] for idx in y_hat[:ymax]]
205
+ seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
206
+ seq_hat_text = "".join(seq_hat).replace(self.space, " ")
207
+ seq_hat_text = seq_hat_text.replace(self.blank, "")
208
+ seq_true_text = "".join(seq_true).replace(self.space, " ")
209
+ seqs_hat.append(seq_hat_text)
210
+ seqs_true.append(seq_true_text)
211
+ return seqs_hat, seqs_true
212
+
213
+ def calculate_cer(self, seqs_hat, seqs_true):
214
+ """Calculate sentence-level CER score.
215
+
216
+ :param list seqs_hat: prediction
217
+ :param list seqs_true: reference
218
+ :return: average sentence-level CER score
219
+ :rtype float
220
+ """
221
+ import editdistance
222
+
223
+ char_eds, char_ref_lens = [], []
224
+ for i, seq_hat_text in enumerate(seqs_hat):
225
+ seq_true_text = seqs_true[i]
226
+ hyp_chars = seq_hat_text.replace(" ", "")
227
+ ref_chars = seq_true_text.replace(" ", "")
228
+ char_eds.append(editdistance.eval(hyp_chars, ref_chars))
229
+ char_ref_lens.append(len(ref_chars))
230
+ return float(sum(char_eds)) / sum(char_ref_lens)
231
+
232
+ def calculate_wer(self, seqs_hat, seqs_true):
233
+ """Calculate sentence-level WER score.
234
+
235
+ :param list seqs_hat: prediction
236
+ :param list seqs_true: reference
237
+ :return: average sentence-level WER score
238
+ :rtype float
239
+ """
240
+ import editdistance
241
+
242
+ word_eds, word_ref_lens = [], []
243
+ for i, seq_hat_text in enumerate(seqs_hat):
244
+ seq_true_text = seqs_true[i]
245
+ hyp_words = seq_hat_text.split()
246
+ ref_words = seq_true_text.split()
247
+ word_eds.append(editdistance.eval(hyp_words, ref_words))
248
+ word_ref_lens.append(len(ref_words))
249
+ return float(sum(word_eds)) / sum(word_ref_lens)
espnet/nets/lm_interface.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Language model interface."""
2
+
3
+ import argparse
4
+
5
+ from espnet.nets.scorer_interface import ScorerInterface
6
+ from espnet.utils.dynamic_import import dynamic_import
7
+ from espnet.utils.fill_missing_args import fill_missing_args
8
+
9
+
10
+ class LMInterface(ScorerInterface):
11
+ """LM Interface for ESPnet model implementation."""
12
+
13
+ @staticmethod
14
+ def add_arguments(parser):
15
+ """Add arguments to command line argument parser."""
16
+ return parser
17
+
18
+ @classmethod
19
+ def build(cls, n_vocab: int, **kwargs):
20
+ """Initialize this class with python-level args.
21
+
22
+ Args:
23
+ idim (int): The number of vocabulary.
24
+
25
+ Returns:
26
+ LMinterface: A new instance of LMInterface.
27
+
28
+ """
29
+ # local import to avoid cyclic import in lm_train
30
+ from espnet.bin.lm_train import get_parser
31
+
32
+ def wrap(parser):
33
+ return get_parser(parser, required=False)
34
+
35
+ args = argparse.Namespace(**kwargs)
36
+ args = fill_missing_args(args, wrap)
37
+ args = fill_missing_args(args, cls.add_arguments)
38
+ return cls(n_vocab, args)
39
+
40
+ def forward(self, x, t):
41
+ """Compute LM loss value from buffer sequences.
42
+
43
+ Args:
44
+ x (torch.Tensor): Input ids. (batch, len)
45
+ t (torch.Tensor): Target ids. (batch, len)
46
+
47
+ Returns:
48
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
49
+ loss to backward (scalar),
50
+ negative log-likelihood of t: -log p(t) (scalar) and
51
+ the number of elements in x (scalar)
52
+
53
+ Notes:
54
+ The last two return values are used
55
+ in perplexity: p(t)^{-n} = exp(-log p(t) / n)
56
+
57
+ """
58
+ raise NotImplementedError("forward method is not implemented")
59
+
60
+
61
+ predefined_lms = {
62
+ "pytorch": {
63
+ "default": "espnet.nets.pytorch_backend.lm.default:DefaultRNNLM",
64
+ "seq_rnn": "espnet.nets.pytorch_backend.lm.seq_rnn:SequentialRNNLM",
65
+ "transformer": "espnet.nets.pytorch_backend.lm.transformer:TransformerLM",
66
+ },
67
+ "chainer": {"default": "espnet.lm.chainer_backend.lm:DefaultRNNLM"},
68
+ }
69
+
70
+
71
+ def dynamic_import_lm(module, backend):
72
+ """Import LM class dynamically.
73
+
74
+ Args:
75
+ module (str): module_name:class_name or alias in `predefined_lms`
76
+ backend (str): NN backend. e.g., pytorch, chainer
77
+
78
+ Returns:
79
+ type: LM class
80
+
81
+ """
82
+ model_class = dynamic_import(module, predefined_lms.get(backend, dict()))
83
+ assert issubclass(
84
+ model_class, LMInterface
85
+ ), f"{module} does not implement LMInterface"
86
+ return model_class
espnet/nets/pytorch_backend/backbones/conv1d_extractor.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2021 Imperial College London (Pingchuan Ma)
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+ import torch
7
+ from espnet.nets.pytorch_backend.backbones.modules.resnet1d import ResNet1D, BasicBlock1D
8
+
9
+ class Conv1dResNet(torch.nn.Module):
10
+ def __init__(self, relu_type="swish", a_upsample_ratio=1):
11
+ super().__init__()
12
+ self.a_upsample_ratio = a_upsample_ratio
13
+ self.trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type=relu_type, a_upsample_ratio=a_upsample_ratio)
14
+
15
+
16
+ def forward(self, xs_pad):
17
+ """forward.
18
+
19
+ :param xs_pad: torch.Tensor, batch of padded input sequences (B, Tmax, idim)
20
+ """
21
+ B, T, C = xs_pad.size()
22
+ xs_pad = xs_pad[:, :T // 640 * 640, :]
23
+ xs_pad = xs_pad.transpose(1, 2)
24
+ xs_pad = self.trunk(xs_pad)
25
+ return xs_pad.transpose(1, 2)
espnet/nets/pytorch_backend/backbones/conv3d_extractor.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2021 Imperial College London (Pingchuan Ma)
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from espnet.nets.pytorch_backend.backbones.modules.resnet import ResNet, BasicBlock
10
+ from espnet.nets.pytorch_backend.transformer.convolution import Swish
11
+
12
+
13
+ def threeD_to_2D_tensor(x):
14
+ n_batch, n_channels, s_time, sx, sy = x.shape
15
+ x = x.transpose(1, 2)
16
+ return x.reshape(n_batch * s_time, n_channels, sx, sy)
17
+
18
+
19
+
20
+ class Conv3dResNet(torch.nn.Module):
21
+ """Conv3dResNet module
22
+ """
23
+
24
+ def __init__(self, backbone_type="resnet", relu_type="swish"):
25
+ """__init__.
26
+
27
+ :param backbone_type: str, the type of a visual front-end.
28
+ :param relu_type: str, activation function used in an audio front-end.
29
+ """
30
+ super(Conv3dResNet, self).__init__()
31
+ self.frontend_nout = 64
32
+ self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type)
33
+ self.frontend3D = nn.Sequential(
34
+ nn.Conv3d(1, self.frontend_nout, (5, 7, 7), (1, 2, 2), (2, 3, 3), bias=False),
35
+ nn.BatchNorm3d(self.frontend_nout),
36
+ Swish(),
37
+ nn.MaxPool3d((1, 3, 3), (1, 2, 2), (0, 1, 1))
38
+ )
39
+
40
+
41
+ def forward(self, xs_pad):
42
+ B, C, T, H, W = xs_pad.size()
43
+ xs_pad = self.frontend3D(xs_pad)
44
+ Tnew = xs_pad.shape[2]
45
+ xs_pad = threeD_to_2D_tensor(xs_pad)
46
+ xs_pad = self.trunk(xs_pad)
47
+ return xs_pad.view(B, Tnew, xs_pad.size(1))
espnet/nets/pytorch_backend/backbones/modules/resnet.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch.nn as nn
3
+ import pdb
4
+
5
+ from espnet.nets.pytorch_backend.transformer.convolution import Swish
6
+
7
+
8
+ def conv3x3(in_planes, out_planes, stride=1):
9
+ """conv3x3.
10
+
11
+ :param in_planes: int, number of channels in the input sequence.
12
+ :param out_planes: int, number of channels produced by the convolution.
13
+ :param stride: int, size of the convolving kernel.
14
+ """
15
+ return nn.Conv2d(
16
+ in_planes,
17
+ out_planes,
18
+ kernel_size=3,
19
+ stride=stride,
20
+ padding=1,
21
+ bias=False,
22
+ )
23
+
24
+
25
+ def downsample_basic_block(inplanes, outplanes, stride):
26
+ """downsample_basic_block.
27
+
28
+ :param inplanes: int, number of channels in the input sequence.
29
+ :param outplanes: int, number of channels produced by the convolution.
30
+ :param stride: int, size of the convolving kernel.
31
+ """
32
+ return nn.Sequential(
33
+ nn.Conv2d(
34
+ inplanes,
35
+ outplanes,
36
+ kernel_size=1,
37
+ stride=stride,
38
+ bias=False,
39
+ ),
40
+ nn.BatchNorm2d(outplanes),
41
+ )
42
+
43
+
44
+ class BasicBlock(nn.Module):
45
+ expansion = 1
46
+
47
+ def __init__(
48
+ self,
49
+ inplanes,
50
+ planes,
51
+ stride=1,
52
+ downsample=None,
53
+ relu_type="swish",
54
+ ):
55
+ """__init__.
56
+
57
+ :param inplanes: int, number of channels in the input sequence.
58
+ :param planes: int, number of channels produced by the convolution.
59
+ :param stride: int, size of the convolving kernel.
60
+ :param downsample: boolean, if True, the temporal resolution is downsampled.
61
+ :param relu_type: str, type of activation function.
62
+ """
63
+ super(BasicBlock, self).__init__()
64
+
65
+ assert relu_type in ["relu", "prelu", "swish"]
66
+
67
+ self.conv1 = conv3x3(inplanes, planes, stride)
68
+ self.bn1 = nn.BatchNorm2d(planes)
69
+
70
+ if relu_type == "relu":
71
+ self.relu1 = nn.ReLU(inplace=True)
72
+ self.relu2 = nn.ReLU(inplace=True)
73
+ elif relu_type == "prelu":
74
+ self.relu1 = nn.PReLU(num_parameters=planes)
75
+ self.relu2 = nn.PReLU(num_parameters=planes)
76
+ elif relu_type == "swish":
77
+ self.relu1 = Swish()
78
+ self.relu2 = Swish()
79
+ else:
80
+ raise NotImplementedError
81
+ # --------
82
+
83
+ self.conv2 = conv3x3(planes, planes)
84
+ self.bn2 = nn.BatchNorm2d(planes)
85
+
86
+ self.downsample = downsample
87
+ self.stride = stride
88
+
89
+ def forward(self, x):
90
+ """forward.
91
+
92
+ :param x: torch.Tensor, input tensor with input size (B, C, T, H, W).
93
+ """
94
+ residual = x
95
+ out = self.conv1(x)
96
+ out = self.bn1(out)
97
+ out = self.relu1(out)
98
+ out = self.conv2(out)
99
+ out = self.bn2(out)
100
+ if self.downsample is not None:
101
+ residual = self.downsample(x)
102
+
103
+ out += residual
104
+ out = self.relu2(out)
105
+
106
+ return out
107
+
108
+
109
+ class ResNet(nn.Module):
110
+
111
+ def __init__(
112
+ self,
113
+ block,
114
+ layers,
115
+ relu_type="swish",
116
+ ):
117
+ super(ResNet, self).__init__()
118
+ self.inplanes = 64
119
+ self.relu_type = relu_type
120
+ self.downsample_block = downsample_basic_block
121
+
122
+ self.layer1 = self._make_layer(block, 64, layers[0])
123
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
124
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
125
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
126
+ self.avgpool = nn.AdaptiveAvgPool2d(1)
127
+
128
+
129
+ def _make_layer(self, block, planes, blocks, stride=1):
130
+ """_make_layer.
131
+
132
+ :param block: torch.nn.Module, class of blocks.
133
+ :param planes: int, number of channels produced by the convolution.
134
+ :param blocks: int, number of layers in a block.
135
+ :param stride: int, size of the convolving kernel.
136
+ """
137
+ downsample = None
138
+ if stride != 1 or self.inplanes != planes * block.expansion:
139
+ downsample = self.downsample_block(
140
+ inplanes=self.inplanes,
141
+ outplanes=planes*block.expansion,
142
+ stride=stride,
143
+ )
144
+
145
+ layers = []
146
+ layers.append(
147
+ block(
148
+ self.inplanes,
149
+ planes,
150
+ stride,
151
+ downsample,
152
+ relu_type=self.relu_type,
153
+ )
154
+ )
155
+ self.inplanes = planes * block.expansion
156
+ for i in range(1, blocks):
157
+ layers.append(
158
+ block(
159
+ self.inplanes,
160
+ planes,
161
+ relu_type=self.relu_type,
162
+ )
163
+ )
164
+
165
+ return nn.Sequential(*layers)
166
+
167
+ def forward(self, x):
168
+ """forward.
169
+
170
+ :param x: torch.Tensor, input tensor with input size (B, C, T, H, W).
171
+ """
172
+ x = self.layer1(x)
173
+ x = self.layer2(x)
174
+ x = self.layer3(x)
175
+ x = self.layer4(x)
176
+ x = self.avgpool(x)
177
+ x = x.view(x.size(0), -1)
178
+ return x
espnet/nets/pytorch_backend/backbones/modules/resnet1d.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch.nn as nn
3
+ import pdb
4
+
5
+ from espnet.nets.pytorch_backend.transformer.convolution import Swish
6
+
7
+
8
+ def conv3x3(in_planes, out_planes, stride=1):
9
+ """conv3x3.
10
+
11
+ :param in_planes: int, number of channels in the input sequence.
12
+ :param out_planes: int, number of channels produced by the convolution.
13
+ :param stride: int, size of the convolving kernel.
14
+ """
15
+ return nn.Conv1d(
16
+ in_planes,
17
+ out_planes,
18
+ kernel_size=3,
19
+ stride=stride,
20
+ padding=1,
21
+ bias=False,
22
+ )
23
+
24
+
25
+ def downsample_basic_block(inplanes, outplanes, stride):
26
+ """downsample_basic_block.
27
+
28
+ :param inplanes: int, number of channels in the input sequence.
29
+ :param outplanes: int, number of channels produced by the convolution.
30
+ :param stride: int, size of the convolving kernel.
31
+ """
32
+ return nn.Sequential(
33
+ nn.Conv1d(
34
+ inplanes,
35
+ outplanes,
36
+ kernel_size=1,
37
+ stride=stride,
38
+ bias=False,
39
+ ),
40
+ nn.BatchNorm1d(outplanes),
41
+ )
42
+
43
+
44
+ class BasicBlock1D(nn.Module):
45
+ expansion = 1
46
+
47
+ def __init__(
48
+ self,
49
+ inplanes,
50
+ planes,
51
+ stride=1,
52
+ downsample=None,
53
+ relu_type="relu",
54
+ ):
55
+ """__init__.
56
+
57
+ :param inplanes: int, number of channels in the input sequence.
58
+ :param planes: int, number of channels produced by the convolution.
59
+ :param stride: int, size of the convolving kernel.
60
+ :param downsample: boolean, if True, the temporal resolution is downsampled.
61
+ :param relu_type: str, type of activation function.
62
+ """
63
+ super(BasicBlock1D, self).__init__()
64
+
65
+ assert relu_type in ["relu","prelu", "swish"]
66
+
67
+ self.conv1 = conv3x3(inplanes, planes, stride)
68
+ self.bn1 = nn.BatchNorm1d(planes)
69
+
70
+ # type of ReLU is an input option
71
+ if relu_type == "relu":
72
+ self.relu1 = nn.ReLU(inplace=True)
73
+ self.relu2 = nn.ReLU(inplace=True)
74
+ elif relu_type == "prelu":
75
+ self.relu1 = nn.PReLU(num_parameters=planes)
76
+ self.relu2 = nn.PReLU(num_parameters=planes)
77
+ elif relu_type == "swish":
78
+ self.relu1 = Swish()
79
+ self.relu2 = Swish()
80
+ else:
81
+ raise NotImplementedError
82
+ # --------
83
+
84
+ self.conv2 = conv3x3(planes, planes)
85
+ self.bn2 = nn.BatchNorm1d(planes)
86
+
87
+ self.downsample = downsample
88
+ self.stride = stride
89
+
90
+ def forward(self, x):
91
+ """forward.
92
+
93
+ :param x: torch.Tensor, input tensor with input size (B, C, T)
94
+ """
95
+ residual = x
96
+ out = self.conv1(x)
97
+ out = self.bn1(out)
98
+ out = self.relu1(out)
99
+ out = self.conv2(out)
100
+ out = self.bn2(out)
101
+ if self.downsample is not None:
102
+ residual = self.downsample(x)
103
+
104
+ out += residual
105
+ out = self.relu2(out)
106
+
107
+ return out
108
+
109
+
110
+ class ResNet1D(nn.Module):
111
+
112
+ def __init__(self,
113
+ block,
114
+ layers,
115
+ relu_type="swish",
116
+ a_upsample_ratio=1,
117
+ ):
118
+ """__init__.
119
+
120
+ :param block: torch.nn.Module, class of blocks.
121
+ :param layers: List, customised layers in each block.
122
+ :param relu_type: str, type of activation function.
123
+ :param a_upsample_ratio: int, The ratio related to the \
124
+ temporal resolution of output features of the frontend. \
125
+ a_upsample_ratio=1 produce features with a fps of 25.
126
+ """
127
+ super(ResNet1D, self).__init__()
128
+ self.inplanes = 64
129
+ self.relu_type = relu_type
130
+ self.downsample_block = downsample_basic_block
131
+ self.a_upsample_ratio = a_upsample_ratio
132
+
133
+ self.conv1 = nn.Conv1d(
134
+ in_channels=1,
135
+ out_channels=self.inplanes,
136
+ kernel_size=80,
137
+ stride=4,
138
+ padding=38,
139
+ bias=False,
140
+ )
141
+ self.bn1 = nn.BatchNorm1d(self.inplanes)
142
+
143
+ if relu_type == "relu":
144
+ self.relu = nn.ReLU(inplace=True)
145
+ elif relu_type == "prelu":
146
+ self.relu = nn.PReLU(num_parameters=self.inplanes)
147
+ elif relu_type == "swish":
148
+ self.relu = Swish()
149
+
150
+ self.layer1 = self._make_layer(block, 64, layers[0])
151
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
152
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
153
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
154
+ self.avgpool = nn.AvgPool1d(
155
+ kernel_size=20//self.a_upsample_ratio,
156
+ stride=20//self.a_upsample_ratio,
157
+ )
158
+
159
+
160
+ def _make_layer(self, block, planes, blocks, stride=1):
161
+ """_make_layer.
162
+
163
+ :param block: torch.nn.Module, class of blocks.
164
+ :param planes: int, number of channels produced by the convolution.
165
+ :param blocks: int, number of layers in a block.
166
+ :param stride: int, size of the convolving kernel.
167
+ """
168
+
169
+ downsample = None
170
+ if stride != 1 or self.inplanes != planes * block.expansion:
171
+ downsample = self.downsample_block(
172
+ inplanes=self.inplanes,
173
+ outplanes=planes*block.expansion,
174
+ stride=stride,
175
+ )
176
+
177
+ layers = []
178
+ layers.append(
179
+ block(
180
+ self.inplanes,
181
+ planes,
182
+ stride,
183
+ downsample,
184
+ relu_type=self.relu_type,
185
+ )
186
+ )
187
+ self.inplanes = planes * block.expansion
188
+ for i in range(1, blocks):
189
+ layers.append(
190
+ block(
191
+ self.inplanes,
192
+ planes,
193
+ relu_type=self.relu_type,
194
+ )
195
+ )
196
+
197
+ return nn.Sequential(*layers)
198
+
199
+ def forward(self, x):
200
+ """forward.
201
+
202
+ :param x: torch.Tensor, input tensor with input size (B, C, T)
203
+ """
204
+ x = self.conv1(x)
205
+ x = self.bn1(x)
206
+ x = self.relu(x)
207
+
208
+ x = self.layer1(x)
209
+ x = self.layer2(x)
210
+ x = self.layer3(x)
211
+ x = self.layer4(x)
212
+ x = self.avgpool(x)
213
+ return x
espnet/nets/pytorch_backend/backbones/modules/shufflenetv2.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.autograd import Variable
5
+ from collections import OrderedDict
6
+ from torch.nn import init
7
+ import math
8
+
9
+ import pdb
10
+
11
+ def conv_bn(inp, oup, stride):
12
+ return nn.Sequential(
13
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
14
+ nn.BatchNorm2d(oup),
15
+ nn.ReLU(inplace=True)
16
+ )
17
+
18
+
19
+ def conv_1x1_bn(inp, oup):
20
+ return nn.Sequential(
21
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
22
+ nn.BatchNorm2d(oup),
23
+ nn.ReLU(inplace=True)
24
+ )
25
+
26
+ def channel_shuffle(x, groups):
27
+ batchsize, num_channels, height, width = x.data.size()
28
+
29
+ channels_per_group = num_channels // groups
30
+
31
+ # reshape
32
+ x = x.view(batchsize, groups,
33
+ channels_per_group, height, width)
34
+
35
+ x = torch.transpose(x, 1, 2).contiguous()
36
+
37
+ # flatten
38
+ x = x.view(batchsize, -1, height, width)
39
+
40
+ return x
41
+
42
+ class InvertedResidual(nn.Module):
43
+ def __init__(self, inp, oup, stride, benchmodel):
44
+ super(InvertedResidual, self).__init__()
45
+ self.benchmodel = benchmodel
46
+ self.stride = stride
47
+ assert stride in [1, 2]
48
+
49
+ oup_inc = oup//2
50
+
51
+ if self.benchmodel == 1:
52
+ #assert inp == oup_inc
53
+ self.banch2 = nn.Sequential(
54
+ # pw
55
+ nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
56
+ nn.BatchNorm2d(oup_inc),
57
+ nn.ReLU(inplace=True),
58
+ # dw
59
+ nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
60
+ nn.BatchNorm2d(oup_inc),
61
+ # pw-linear
62
+ nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
63
+ nn.BatchNorm2d(oup_inc),
64
+ nn.ReLU(inplace=True),
65
+ )
66
+ else:
67
+ self.banch1 = nn.Sequential(
68
+ # dw
69
+ nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
70
+ nn.BatchNorm2d(inp),
71
+ # pw-linear
72
+ nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
73
+ nn.BatchNorm2d(oup_inc),
74
+ nn.ReLU(inplace=True),
75
+ )
76
+
77
+ self.banch2 = nn.Sequential(
78
+ # pw
79
+ nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
80
+ nn.BatchNorm2d(oup_inc),
81
+ nn.ReLU(inplace=True),
82
+ # dw
83
+ nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
84
+ nn.BatchNorm2d(oup_inc),
85
+ # pw-linear
86
+ nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
87
+ nn.BatchNorm2d(oup_inc),
88
+ nn.ReLU(inplace=True),
89
+ )
90
+
91
+ @staticmethod
92
+ def _concat(x, out):
93
+ # concatenate along channel axis
94
+ return torch.cat((x, out), 1)
95
+
96
+ def forward(self, x):
97
+ if 1==self.benchmodel:
98
+ x1 = x[:, :(x.shape[1]//2), :, :]
99
+ x2 = x[:, (x.shape[1]//2):, :, :]
100
+ out = self._concat(x1, self.banch2(x2))
101
+ elif 2==self.benchmodel:
102
+ out = self._concat(self.banch1(x), self.banch2(x))
103
+
104
+ return channel_shuffle(out, 2)
105
+
106
+
107
+ class ShuffleNetV2(nn.Module):
108
+ def __init__(self, n_class=1000, input_size=224, width_mult=2.):
109
+ super(ShuffleNetV2, self).__init__()
110
+
111
+ assert input_size % 32 == 0, "Input size needs to be divisible by 32"
112
+
113
+ self.stage_repeats = [4, 8, 4]
114
+ # index 0 is invalid and should never be called.
115
+ # only used for indexing convenience.
116
+ if width_mult == 0.5:
117
+ self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
118
+ elif width_mult == 1.0:
119
+ self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]
120
+ elif width_mult == 1.5:
121
+ self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
122
+ elif width_mult == 2.0:
123
+ self.stage_out_channels = [-1, 24, 244, 488, 976, 2048]
124
+ else:
125
+ raise ValueError(
126
+ """Width multiplier should be in [0.5, 1.0, 1.5, 2.0]. Current value: {}""".format(width_mult))
127
+
128
+ # building first layer
129
+ input_channel = self.stage_out_channels[1]
130
+ self.conv1 = conv_bn(3, input_channel, 2)
131
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
132
+
133
+ self.features = []
134
+ # building inverted residual blocks
135
+ for idxstage in range(len(self.stage_repeats)):
136
+ numrepeat = self.stage_repeats[idxstage]
137
+ output_channel = self.stage_out_channels[idxstage+2]
138
+ for i in range(numrepeat):
139
+ if i == 0:
140
+ #inp, oup, stride, benchmodel):
141
+ self.features.append(InvertedResidual(input_channel, output_channel, 2, 2))
142
+ else:
143
+ self.features.append(InvertedResidual(input_channel, output_channel, 1, 1))
144
+ input_channel = output_channel
145
+
146
+
147
+ # make it nn.Sequential
148
+ self.features = nn.Sequential(*self.features)
149
+
150
+ # building last several layers
151
+ self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1])
152
+ self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32)))
153
+
154
+ # building classifier
155
+ self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class))
156
+
157
+ def forward(self, x):
158
+ x = self.conv1(x)
159
+ x = self.maxpool(x)
160
+ x = self.features(x)
161
+ x = self.conv_last(x)
162
+ x = self.globalpool(x)
163
+ x = x.view(-1, self.stage_out_channels[-1])
164
+ x = self.classifier(x)
165
+ return x
espnet/nets/pytorch_backend/ctc.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from distutils.version import LooseVersion
2
+ import logging
3
+
4
+ import numpy as np
5
+ import six
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from espnet.nets.pytorch_backend.nets_utils import to_device
10
+
11
+
12
+ class CTC(torch.nn.Module):
13
+ """CTC module
14
+
15
+ :param int odim: dimension of outputs
16
+ :param int eprojs: number of encoder projection units
17
+ :param float dropout_rate: dropout rate (0.0 ~ 1.0)
18
+ :param str ctc_type: builtin or warpctc
19
+ :param bool reduce: reduce the CTC loss into a scalar
20
+ """
21
+
22
+ def __init__(self, odim, eprojs, dropout_rate, ctc_type="warpctc", reduce=True):
23
+ super().__init__()
24
+ self.dropout_rate = dropout_rate
25
+ self.loss = None
26
+ self.ctc_lo = torch.nn.Linear(eprojs, odim)
27
+ self.dropout = torch.nn.Dropout(dropout_rate)
28
+ self.probs = None # for visualization
29
+
30
+ # With PyTorch >= 1.7.0, the built-in CTC implementation is always used
31
+ self.ctc_type = (
32
+ ctc_type
33
+ if LooseVersion(torch.__version__) < LooseVersion("1.7.0")
34
+ else "builtin"
35
+ )
36
+
37
+ if self.ctc_type == "builtin":
38
+ reduction_type = "sum" if reduce else "none"
39
+ self.ctc_loss = torch.nn.CTCLoss(
40
+ reduction=reduction_type, zero_infinity=True
41
+ )
42
+ elif self.ctc_type == "cudnnctc":
43
+ reduction_type = "sum" if reduce else "none"
44
+ self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)
45
+ elif self.ctc_type == "warpctc":
46
+ import warpctc_pytorch as warp_ctc
47
+
48
+ self.ctc_loss = warp_ctc.CTCLoss(size_average=True, reduce=reduce)
49
+ elif self.ctc_type == "gtnctc":
50
+ from espnet.nets.pytorch_backend.gtn_ctc import GTNCTCLossFunction
51
+
52
+ self.ctc_loss = GTNCTCLossFunction.apply
53
+ else:
54
+ raise ValueError(
55
+ 'ctc_type must be "builtin", "cudnnctc", "warpctc", or "gtnctc": {}'.format(self.ctc_type)
56
+ )
57
+
58
+ self.ignore_id = -1
59
+ self.reduce = reduce
60
+
61
+ def loss_fn(self, th_pred, th_target, th_ilen, th_olen):
62
+ if self.ctc_type in ["builtin", "cudnnctc"]:
63
+ th_pred = th_pred.log_softmax(2)
64
+ # Use the deterministic CuDNN implementation of CTC loss to avoid
65
+ # [issue#17798](https://github.com/pytorch/pytorch/issues/17798)
66
+ with torch.backends.cudnn.flags(deterministic=True):
67
+ loss = self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
68
+ # Batch-size average
69
+ loss = loss / th_pred.size(1)
70
+ return loss
71
+ elif self.ctc_type == "warpctc":
72
+ return self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
73
+ elif self.ctc_type == "gtnctc":
74
+ targets = [t.tolist() for t in th_target]
75
+ log_probs = torch.nn.functional.log_softmax(th_pred, dim=2)
76
+ return self.ctc_loss(log_probs, targets, th_ilen, 0, "none")
77
+ else:
78
+ raise NotImplementedError
79
+
80
+ def forward(self, hs_pad, hlens, ys_pad):
81
+ """CTC forward
82
+
83
+ :param torch.Tensor hs_pad: batch of padded hidden state sequences (B, Tmax, D)
84
+ :param torch.Tensor hlens: batch of lengths of hidden state sequences (B)
85
+ :param torch.Tensor ys_pad:
86
+ batch of padded character id sequence tensor (B, Lmax)
87
+ :return: ctc loss value
88
+ :rtype: torch.Tensor
89
+ """
90
+ # TODO(kan-bayashi): need to make more smart way
91
+ ys = [y[y != self.ignore_id] for y in ys_pad] # parse padded ys
92
+
93
+ # zero padding for hs
94
+ ys_hat = self.ctc_lo(self.dropout(hs_pad))
95
+ if self.ctc_type != "gtnctc":
96
+ ys_hat = ys_hat.transpose(0, 1)
97
+
98
+ if self.ctc_type == "builtin":
99
+ olens = to_device(ys_hat, torch.LongTensor([len(s) for s in ys]))
100
+ hlens = hlens.long()
101
+ ys_pad = torch.cat(ys) # without this the code breaks for asr_mix
102
+ self.loss = self.loss_fn(ys_hat, ys_pad, hlens, olens)
103
+ else:
104
+ self.loss = None
105
+ hlens = torch.from_numpy(np.fromiter(hlens, dtype=np.int32))
106
+ olens = torch.from_numpy(
107
+ np.fromiter((x.size(0) for x in ys), dtype=np.int32)
108
+ )
109
+ # zero padding for ys
110
+ ys_true = torch.cat(ys).cpu().int() # batch x olen
111
+ # get ctc loss
112
+ # expected shape of seqLength x batchSize x alphabet_size
113
+ dtype = ys_hat.dtype
114
+ if self.ctc_type == "warpctc" or dtype == torch.float16:
115
+ # warpctc only supports float32
116
+ # torch.ctc does not support float16 (#1751)
117
+ ys_hat = ys_hat.to(dtype=torch.float32)
118
+ if self.ctc_type == "cudnnctc":
119
+ # use GPU when using the cuDNN implementation
120
+ ys_true = to_device(hs_pad, ys_true)
121
+ if self.ctc_type == "gtnctc":
122
+ # keep as list for gtn
123
+ ys_true = ys
124
+ self.loss = to_device(
125
+ hs_pad, self.loss_fn(ys_hat, ys_true, hlens, olens)
126
+ ).to(dtype=dtype)
127
+
128
+ # get length info
129
+ logging.info(
130
+ self.__class__.__name__
131
+ + " input lengths: "
132
+ + "".join(str(hlens).split("\n"))
133
+ )
134
+ logging.info(
135
+ self.__class__.__name__
136
+ + " output lengths: "
137
+ + "".join(str(olens).split("\n"))
138
+ )
139
+
140
+ if self.reduce:
141
+ # NOTE: sum() is needed to keep consistency
142
+ # since warpctc return as tensor w/ shape (1,)
143
+ # but builtin return as tensor w/o shape (scalar).
144
+ self.loss = self.loss.sum()
145
+ logging.info("ctc loss:" + str(float(self.loss)))
146
+
147
+ return self.loss
148
+
149
+ def softmax(self, hs_pad):
150
+ """softmax of frame activations
151
+
152
+ :param torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
153
+ :return: log softmax applied 3d tensor (B, Tmax, odim)
154
+ :rtype: torch.Tensor
155
+ """
156
+ self.probs = F.softmax(self.ctc_lo(hs_pad), dim=2)
157
+ return self.probs
158
+
159
+ def log_softmax(self, hs_pad):
160
+ """log_softmax of frame activations
161
+
162
+ :param torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
163
+ :return: log softmax applied 3d tensor (B, Tmax, odim)
164
+ :rtype: torch.Tensor
165
+ """
166
+ return F.log_softmax(self.ctc_lo(hs_pad), dim=2)
167
+
168
+ def argmax(self, hs_pad):
169
+ """argmax of frame activations
170
+
171
+ :param torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
172
+ :return: argmax applied 2d tensor (B, Tmax)
173
+ :rtype: torch.Tensor
174
+ """
175
+ return torch.argmax(self.ctc_lo(hs_pad), dim=2)
176
+
177
+ def forced_align(self, h, y, blank_id=0):
178
+ """forced alignment.
179
+
180
+ :param torch.Tensor h: hidden state sequence, 2d tensor (T, D)
181
+ :param torch.Tensor y: id sequence tensor 1d tensor (L)
182
+ :param int blank_id: blank symbol index
183
+ :return: best alignment results
184
+ :rtype: list
185
+ """
186
+
187
+ def interpolate_blank(label, blank_id=0):
188
+ """Insert blank token between every two label token."""
189
+ label = np.expand_dims(label, 1)
190
+ blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id
191
+ label = np.concatenate([blanks, label], axis=1)
192
+ label = label.reshape(-1)
193
+ label = np.append(label, label[0])
194
+ return label
195
+
196
+ lpz = self.log_softmax(h)
197
+ lpz = lpz.squeeze(0)
198
+
199
+ y_int = interpolate_blank(y, blank_id)
200
+
201
+ logdelta = np.zeros((lpz.size(0), len(y_int))) - 100000000000.0 # log of zero
202
+ state_path = (
203
+ np.zeros((lpz.size(0), len(y_int)), dtype=np.int16) - 1
204
+ ) # state path
205
+
206
+ logdelta[0, 0] = lpz[0][y_int[0]]
207
+ logdelta[0, 1] = lpz[0][y_int[1]]
208
+
209
+ for t in six.moves.range(1, lpz.size(0)):
210
+ for s in six.moves.range(len(y_int)):
211
+ if y_int[s] == blank_id or s < 2 or y_int[s] == y_int[s - 2]:
212
+ candidates = np.array([logdelta[t - 1, s], logdelta[t - 1, s - 1]])
213
+ prev_state = [s, s - 1]
214
+ else:
215
+ candidates = np.array(
216
+ [
217
+ logdelta[t - 1, s],
218
+ logdelta[t - 1, s - 1],
219
+ logdelta[t - 1, s - 2],
220
+ ]
221
+ )
222
+ prev_state = [s, s - 1, s - 2]
223
+ logdelta[t, s] = np.max(candidates) + lpz[t][y_int[s]]
224
+ state_path[t, s] = prev_state[np.argmax(candidates)]
225
+
226
+ state_seq = -1 * np.ones((lpz.size(0), 1), dtype=np.int16)
227
+
228
+ candidates = np.array(
229
+ [logdelta[-1, len(y_int) - 1], logdelta[-1, len(y_int) - 2]]
230
+ )
231
+ prev_state = [len(y_int) - 1, len(y_int) - 2]
232
+ state_seq[-1] = prev_state[np.argmax(candidates)]
233
+ for t in six.moves.range(lpz.size(0) - 2, -1, -1):
234
+ state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]
235
+
236
+ output_state_seq = []
237
+ for t in six.moves.range(0, lpz.size(0)):
238
+ output_state_seq.append(y_int[state_seq[t, 0]])
239
+
240
+ return output_state_seq
241
+
242
+
243
+ def ctc_for(args, odim, reduce=True):
244
+ """Returns the CTC module for the given args and output dimension
245
+
246
+ :param Namespace args: the program args
247
+ :param int odim : The output dimension
248
+ :param bool reduce : return the CTC loss in a scalar
249
+ :return: the corresponding CTC module
250
+ """
251
+ num_encs = getattr(args, "num_encs", 1) # use getattr to keep compatibility
252
+ if num_encs == 1:
253
+ # compatible with single encoder asr mode
254
+ return CTC(
255
+ odim, args.eprojs, args.dropout_rate, ctc_type=args.ctc_type, reduce=reduce
256
+ )
257
+ elif num_encs >= 1:
258
+ ctcs_list = torch.nn.ModuleList()
259
+ if args.share_ctc:
260
+ # use dropout_rate of the first encoder
261
+ ctc = CTC(
262
+ odim,
263
+ args.eprojs,
264
+ args.dropout_rate[0],
265
+ ctc_type=args.ctc_type,
266
+ reduce=reduce,
267
+ )
268
+ ctcs_list.append(ctc)
269
+ else:
270
+ for idx in range(num_encs):
271
+ ctc = CTC(
272
+ odim,
273
+ args.eprojs,
274
+ args.dropout_rate[idx],
275
+ ctc_type=args.ctc_type,
276
+ reduce=reduce,
277
+ )
278
+ ctcs_list.append(ctc)
279
+ return ctcs_list
280
+ else:
281
+ raise ValueError(
282
+ "Number of encoders needs to be more than one. {}".format(num_encs)
283
+ )
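As a usage sketch (not part of the commit), the CTC module above can be exercised with random encoder outputs. Target ids use -1 as the padding id, matching self.ignore_id, and are drawn above 0 so the blank index is never a label.

import torch
from espnet.nets.pytorch_backend.ctc import CTC

odim, eprojs = 10, 32                        # vocab size (blank at index 0), encoder projection dim
ctc = CTC(odim, eprojs, dropout_rate=0.1, ctc_type="builtin", reduce=True)

hs_pad = torch.randn(2, 50, eprojs)          # (B, Tmax, eprojs) padded encoder states
hlens = torch.tensor([50, 40])               # valid frame counts per utterance
ys_pad = torch.full((2, 6), -1, dtype=torch.long)
ys_pad[0, :6] = torch.randint(1, odim, (6,))
ys_pad[1, :4] = torch.randint(1, odim, (4,))

loss = ctc(hs_pad, hlens, ys_pad)            # summed batch loss (reduce=True)
print(float(loss))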
espnet/nets/pytorch_backend/e2e_asr_transformer.py ADDED
@@ -0,0 +1,320 @@
1
+ # Copyright 2019 Shigeki Karita
2
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ """Transformer speech recognition model (pytorch)."""
5
+
6
+ from argparse import Namespace
7
+ from distutils.util import strtobool
8
+ import logging
9
+ import math
10
+
11
+ import numpy
12
+ import torch
13
+
14
+ from espnet.nets.ctc_prefix_score import CTCPrefixScore
15
+ from espnet.nets.e2e_asr_common import end_detect
16
+ from espnet.nets.e2e_asr_common import ErrorCalculator
17
+ from espnet.nets.pytorch_backend.ctc import CTC
18
+ from espnet.nets.pytorch_backend.nets_utils import get_subsample
19
+ from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
20
+ from espnet.nets.pytorch_backend.nets_utils import th_accuracy
21
+ from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos
22
+ from espnet.nets.pytorch_backend.transformer.attention import (
23
+ MultiHeadedAttention, # noqa: H301
24
+ RelPositionMultiHeadedAttention, # noqa: H301
25
+ )
26
+ from espnet.nets.pytorch_backend.transformer.decoder import Decoder
27
+ from espnet.nets.pytorch_backend.transformer.encoder import Encoder
28
+ from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
29
+ LabelSmoothingLoss, # noqa: H301
30
+ )
31
+ from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
32
+ from espnet.nets.pytorch_backend.transformer.mask import target_mask
33
+ from espnet.nets.scorers.ctc import CTCPrefixScorer
34
+
35
+
36
+ class E2E(torch.nn.Module):
37
+ """E2E module.
38
+
39
+ :param int idim: dimension of inputs
40
+ :param int odim: dimension of outputs
41
+ :param Namespace args: argument Namespace containing options
42
+
43
+ """
44
+
45
+ @staticmethod
46
+ def add_arguments(parser):
47
+ """Add arguments."""
48
+ group = parser.add_argument_group("transformer model setting")
49
+
50
+ group.add_argument(
51
+ "--transformer-init",
52
+ type=str,
53
+ default="pytorch",
54
+ choices=[
55
+ "pytorch",
56
+ "xavier_uniform",
57
+ "xavier_normal",
58
+ "kaiming_uniform",
59
+ "kaiming_normal",
60
+ ],
61
+ help="how to initialize transformer parameters",
62
+ )
63
+ group.add_argument(
64
+ "--transformer-input-layer",
65
+ type=str,
66
+ default="conv2d",
67
+ choices=["conv3d", "conv2d", "conv1d", "linear", "embed"],
68
+ help="transformer input layer type",
69
+ )
70
+ group.add_argument(
71
+ "--transformer-encoder-attn-layer-type",
72
+ type=str,
73
+ default="mha",
74
+ choices=["mha", "rel_mha", "legacy_rel_mha"],
75
+ help="transformer encoder attention layer type",
76
+ )
77
+ group.add_argument(
78
+ "--transformer-attn-dropout-rate",
79
+ default=None,
80
+ type=float,
81
+ help="dropout in transformer attention. use --dropout-rate if None is set",
82
+ )
83
+ group.add_argument(
84
+ "--transformer-lr",
85
+ default=10.0,
86
+ type=float,
87
+ help="Initial value of learning rate",
88
+ )
89
+ group.add_argument(
90
+ "--transformer-warmup-steps",
91
+ default=25000,
92
+ type=int,
93
+ help="optimizer warmup steps",
94
+ )
95
+ group.add_argument(
96
+ "--transformer-length-normalized-loss",
97
+ default=True,
98
+ type=strtobool,
99
+ help="normalize loss by length",
100
+ )
101
+ group.add_argument(
102
+ "--dropout-rate",
103
+ default=0.0,
104
+ type=float,
105
+ help="Dropout rate for the encoder",
106
+ )
107
+ group.add_argument(
108
+ "--macaron-style",
109
+ default=False,
110
+ type=strtobool,
111
+ help="Whether to use macaron style for positionwise layer",
112
+ )
113
+ # -- input
114
+ group.add_argument(
115
+ "--a-upsample-ratio",
116
+ default=1,
117
+ type=int,
118
+ help="Upsample rate for audio",
119
+ )
120
+ group.add_argument(
121
+ "--relu-type",
122
+ default="swish",
123
+ type=str,
124
+ help="the type of activation layer",
125
+ )
126
+ # Encoder
127
+ group.add_argument(
128
+ "--elayers",
129
+ default=4,
130
+ type=int,
131
+ help="Number of encoder layers (for shared recognition part "
132
+ "in multi-speaker asr mode)",
133
+ )
134
+ group.add_argument(
135
+ "--eunits",
136
+ "-u",
137
+ default=300,
138
+ type=int,
139
+ help="Number of encoder hidden units",
140
+ )
141
+ group.add_argument(
142
+ "--use-cnn-module",
143
+ default=False,
144
+ type=strtobool,
145
+ help="Use convolution module or not",
146
+ )
147
+ group.add_argument(
148
+ "--cnn-module-kernel",
149
+ default=31,
150
+ type=int,
151
+ help="Kernel size of convolution module.",
152
+ )
153
+ # Attention
154
+ group.add_argument(
155
+ "--adim",
156
+ default=320,
157
+ type=int,
158
+ help="Number of attention transformation dimensions",
159
+ )
160
+ group.add_argument(
161
+ "--aheads",
162
+ default=4,
163
+ type=int,
164
+ help="Number of heads for multi head attention",
165
+ )
166
+ group.add_argument(
167
+ "--zero-triu",
168
+ default=False,
169
+ type=strtobool,
170
+ help="If true, zero the uppper triangular part of attention matrix.",
171
+ )
172
+ # Relative positional encoding
173
+ group.add_argument(
174
+ "--rel-pos-type",
175
+ type=str,
176
+ default="legacy",
177
+ choices=["legacy", "latest"],
178
+ help="Whether to use the latest relative positional encoding or the legacy one."
179
+ "The legacy relative positional encoding will be deprecated in the future."
180
+ "More Details can be found in https://github.com/espnet/espnet/pull/2816.",
181
+ )
182
+ # Decoder
183
+ group.add_argument(
184
+ "--dlayers", default=1, type=int, help="Number of decoder layers"
185
+ )
186
+ group.add_argument(
187
+ "--dunits", default=320, type=int, help="Number of decoder hidden units"
188
+ )
189
+ # -- pretrain
190
+ group.add_argument("--pretrain-dataset",
191
+ default="",
192
+ type=str,
193
+ help='pre-trained dataset for encoder'
194
+ )
195
+ # -- custom name
196
+ group.add_argument("--custom-pretrain-name",
197
+ default="",
198
+ type=str,
199
+ help='pre-trained model for encoder'
200
+ )
201
+ return parser
202
+
203
+ @property
204
+ def attention_plot_class(self):
205
+ """Return PlotAttentionReport."""
206
+ return PlotAttentionReport
207
+
208
+ def __init__(self, odim, args, ignore_id=-1):
209
+ """Construct an E2E object.
210
+ :param int odim: dimension of outputs
211
+ :param Namespace args: argument Namespace containing options
212
+ """
213
+ torch.nn.Module.__init__(self)
214
+ if args.transformer_attn_dropout_rate is None:
215
+ args.transformer_attn_dropout_rate = args.dropout_rate
216
+ # Check the relative positional encoding type
217
+ self.rel_pos_type = getattr(args, "rel_pos_type", None)
218
+ if self.rel_pos_type is None and args.transformer_encoder_attn_layer_type == "rel_mha":
219
+ args.transformer_encoder_attn_layer_type = "legacy_rel_mha"
220
+ logging.warning(
221
+ "Using legacy_rel_pos and it will be deprecated in the future."
222
+ )
223
+
224
+ idim = 80
225
+
226
+ self.encoder = Encoder(
227
+ idim=idim,
228
+ attention_dim=args.adim,
229
+ attention_heads=args.aheads,
230
+ linear_units=args.eunits,
231
+ num_blocks=args.elayers,
232
+ input_layer=args.transformer_input_layer,
233
+ dropout_rate=args.dropout_rate,
234
+ positional_dropout_rate=args.dropout_rate,
235
+ attention_dropout_rate=args.transformer_attn_dropout_rate,
236
+ encoder_attn_layer_type=args.transformer_encoder_attn_layer_type,
237
+ macaron_style=args.macaron_style,
238
+ use_cnn_module=args.use_cnn_module,
239
+ cnn_module_kernel=args.cnn_module_kernel,
240
+ zero_triu=getattr(args, "zero_triu", False),
241
+ a_upsample_ratio=args.a_upsample_ratio,
242
+ relu_type=getattr(args, "relu_type", "swish"),
243
+ )
244
+
245
+ self.transformer_input_layer = args.transformer_input_layer
246
+ self.a_upsample_ratio = args.a_upsample_ratio
247
+
248
+ if args.mtlalpha < 1:
249
+ self.decoder = Decoder(
250
+ odim=odim,
251
+ attention_dim=args.adim,
252
+ attention_heads=args.aheads,
253
+ linear_units=args.dunits,
254
+ num_blocks=args.dlayers,
255
+ dropout_rate=args.dropout_rate,
256
+ positional_dropout_rate=args.dropout_rate,
257
+ self_attention_dropout_rate=args.transformer_attn_dropout_rate,
258
+ src_attention_dropout_rate=args.transformer_attn_dropout_rate,
259
+ )
260
+ else:
261
+ self.decoder = None
262
+ self.blank = 0
263
+ self.sos = odim - 1
264
+ self.eos = odim - 1
265
+ self.odim = odim
266
+ self.ignore_id = ignore_id
267
+ self.subsample = get_subsample(args, mode="asr", arch="transformer")
268
+
269
+ # self.lsm_weight = a
270
+ self.criterion = LabelSmoothingLoss(
271
+ self.odim,
272
+ self.ignore_id,
273
+ args.lsm_weight,
274
+ args.transformer_length_normalized_loss,
275
+ )
276
+
277
+ self.adim = args.adim
278
+ self.mtlalpha = args.mtlalpha
279
+ if args.mtlalpha > 0.0:
280
+ self.ctc = CTC(
281
+ odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True
282
+ )
283
+ else:
284
+ self.ctc = None
285
+
286
+ if args.report_cer or args.report_wer:
287
+ self.error_calculator = ErrorCalculator(
288
+ args.char_list,
289
+ args.sym_space,
290
+ args.sym_blank,
291
+ args.report_cer,
292
+ args.report_wer,
293
+ )
294
+ else:
295
+ self.error_calculator = None
296
+ self.rnnlm = None
297
+
298
+ def scorers(self):
299
+ """Scorers."""
300
+ return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
301
+
302
+ def encode(self, x, extract_resnet_feats=False):
303
+ """Encode acoustic features.
304
+
305
+ :param ndarray x: source acoustic feature (T, D)
306
+ :return: encoder outputs
307
+ :rtype: torch.Tensor
308
+ """
309
+ self.eval()
310
+ x = torch.as_tensor(x).unsqueeze(0)
311
+ if extract_resnet_feats:
312
+ resnet_feats = self.encoder(
313
+ x,
314
+ None,
315
+ extract_resnet_feats=extract_resnet_feats,
316
+ )
317
+ return resnet_feats.squeeze(0)
318
+ else:
319
+ enc_output, _ = self.encoder(x, None)
320
+ return enc_output.squeeze(0)
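For decoding, scorers() is meant to be wired into the beam search shipped in this commit. The following is a hypothetical sketch rather than the project's inference script: `model`, `token_list`, and `feats` are assumed to be loaded elsewhere, and the scorer weights are illustrative.

import torch
from espnet.nets.beam_search import BeamSearch
from espnet.nets.scorers.length_bonus import LengthBonus

scorers = model.scorers()                          # {"decoder": ..., "ctc": ...}
scorers["length_bonus"] = LengthBonus(len(token_list))
weights = {"decoder": 0.9, "ctc": 0.1, "length_bonus": 0.0}

beam_search = BeamSearch(
    beam_size=40,
    vocab_size=len(token_list),
    weights=weights,
    scorers=scorers,
    sos=model.sos,
    eos=model.eos,
    token_list=token_list,
)

with torch.no_grad():
    enc = model.encode(feats)                      # (T, adim) encoder memory
    nbest = beam_search(x=enc)
hyp = "".join(token_list[int(y)] for y in nbest[0].yseq[1:-1])   # strip sos/eos
print(hyp)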
espnet/nets/pytorch_backend/e2e_asr_transformer_av.py ADDED
@@ -0,0 +1,352 @@
1
+ # Copyright 2019 Shigeki Karita
2
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ """Transformer speech recognition model (pytorch)."""
5
+
6
+ from argparse import Namespace
7
+ from distutils.util import strtobool
8
+ import logging
9
+ import math
10
+
11
+ import numpy
12
+ import torch
13
+
14
+ from espnet.nets.ctc_prefix_score import CTCPrefixScore
15
+ from espnet.nets.e2e_asr_common import end_detect
16
+ from espnet.nets.e2e_asr_common import ErrorCalculator
17
+ from espnet.nets.pytorch_backend.ctc import CTC
18
+ from espnet.nets.pytorch_backend.nets_utils import get_subsample
19
+ from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
20
+ from espnet.nets.pytorch_backend.nets_utils import th_accuracy
21
+ from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos
22
+ from espnet.nets.pytorch_backend.transformer.attention import (
23
+ MultiHeadedAttention, # noqa: H301
24
+ RelPositionMultiHeadedAttention, # noqa: H301
25
+ )
26
+ from espnet.nets.pytorch_backend.transformer.decoder import Decoder
27
+ from espnet.nets.pytorch_backend.transformer.encoder import Encoder
28
+ from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import (
29
+ LabelSmoothingLoss, # noqa: H301
30
+ )
31
+ from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
32
+ from espnet.nets.pytorch_backend.transformer.mask import target_mask
33
+ from espnet.nets.scorers.ctc import CTCPrefixScorer
34
+ from espnet.nets.pytorch_backend.nets_utils import MLPHead
35
+
36
+
37
+ class E2E(torch.nn.Module):
38
+ """E2E module.
39
+
40
+ :param int idim: dimension of inputs
41
+ :param int odim: dimension of outputs
42
+ :param Namespace args: argument Namespace containing options
43
+
44
+ """
45
+
46
+ @staticmethod
47
+ def add_arguments(parser):
48
+ """Add arguments."""
49
+ group = parser.add_argument_group("transformer model setting")
50
+
51
+ group.add_argument(
52
+ "--transformer-init",
53
+ type=str,
54
+ default="pytorch",
55
+ choices=[
56
+ "pytorch",
57
+ "xavier_uniform",
58
+ "xavier_normal",
59
+ "kaiming_uniform",
60
+ "kaiming_normal",
61
+ ],
62
+ help="how to initialize transformer parameters",
63
+ )
64
+ group.add_argument(
65
+ "--transformer-input-layer",
66
+ type=str,
67
+ default="conv2d",
68
+ choices=["conv3d", "conv2d", "conv1d", "linear", "embed"],
69
+ help="transformer input layer type",
70
+ )
71
+ group.add_argument(
72
+ "--transformer-encoder-attn-layer-type",
73
+ type=str,
74
+ default="mha",
75
+ choices=["mha", "rel_mha", "legacy_rel_mha"],
76
+ help="transformer encoder attention layer type",
77
+ )
78
+ group.add_argument(
79
+ "--transformer-attn-dropout-rate",
80
+ default=None,
81
+ type=float,
82
+ help="dropout in transformer attention. use --dropout-rate if None is set",
83
+ )
84
+ group.add_argument(
85
+ "--transformer-lr",
86
+ default=10.0,
87
+ type=float,
88
+ help="Initial value of learning rate",
89
+ )
90
+ group.add_argument(
91
+ "--transformer-warmup-steps",
92
+ default=25000,
93
+ type=int,
94
+ help="optimizer warmup steps",
95
+ )
96
+ group.add_argument(
97
+ "--transformer-length-normalized-loss",
98
+ default=True,
99
+ type=strtobool,
100
+ help="normalize loss by length",
101
+ )
102
+ group.add_argument(
103
+ "--dropout-rate",
104
+ default=0.0,
105
+ type=float,
106
+ help="Dropout rate for the encoder",
107
+ )
108
+ group.add_argument(
109
+ "--macaron-style",
110
+ default=False,
111
+ type=strtobool,
112
+ help="Whether to use macaron style for positionwise layer",
113
+ )
114
+ # -- input
115
+ group.add_argument(
116
+ "--a-upsample-ratio",
117
+ default=1,
118
+ type=int,
119
+ help="Upsample rate for audio",
120
+ )
121
+ group.add_argument(
122
+ "--relu-type",
123
+ default="swish",
124
+ type=str,
125
+ help="the type of activation layer",
126
+ )
127
+ # Encoder
128
+ group.add_argument(
129
+ "--elayers",
130
+ default=4,
131
+ type=int,
132
+ help="Number of encoder layers (for shared recognition part "
133
+ "in multi-speaker asr mode)",
134
+ )
135
+ group.add_argument(
136
+ "--eunits",
137
+ "-u",
138
+ default=300,
139
+ type=int,
140
+ help="Number of encoder hidden units",
141
+ )
142
+ group.add_argument(
143
+ "--use-cnn-module",
144
+ default=False,
145
+ type=strtobool,
146
+ help="Use convolution module or not",
147
+ )
148
+ group.add_argument(
149
+ "--cnn-module-kernel",
150
+ default=31,
151
+ type=int,
152
+ help="Kernel size of convolution module.",
153
+ )
154
+ # Attention
155
+ group.add_argument(
156
+ "--adim",
157
+ default=320,
158
+ type=int,
159
+ help="Number of attention transformation dimensions",
160
+ )
161
+ group.add_argument(
162
+ "--aheads",
163
+ default=4,
164
+ type=int,
165
+ help="Number of heads for multi head attention",
166
+ )
167
+ group.add_argument(
168
+ "--zero-triu",
169
+ default=False,
170
+ type=strtobool,
171
+ help="If true, zero the uppper triangular part of attention matrix.",
172
+ )
173
+ # Relative positional encoding
174
+ group.add_argument(
175
+ "--rel-pos-type",
176
+ type=str,
177
+ default="legacy",
178
+ choices=["legacy", "latest"],
179
+ help="Whether to use the latest relative positional encoding or the legacy one."
180
+ "The legacy relative positional encoding will be deprecated in the future."
181
+ "More Details can be found in https://github.com/espnet/espnet/pull/2816.",
182
+ )
183
+ # Decoder
184
+ group.add_argument(
185
+ "--dlayers", default=1, type=int, help="Number of decoder layers"
186
+ )
187
+ group.add_argument(
188
+ "--dunits", default=320, type=int, help="Number of decoder hidden units"
189
+ )
190
+ # -- pretrain
191
+ group.add_argument("--pretrain-dataset",
192
+ default="",
193
+ type=str,
194
+ help='pre-trained dataset for encoder'
195
+ )
196
+ # -- custom name
197
+ group.add_argument("--custom-pretrain-name",
198
+ default="",
199
+ type=str,
200
+ help='pre-trained model for encoder'
201
+ )
202
+ return parser
203
+
204
+ @property
205
+ def attention_plot_class(self):
206
+ """Return PlotAttentionReport."""
207
+ return PlotAttentionReport
208
+
209
+ def __init__(self, odim, args, ignore_id=-1):
210
+ """Construct an E2E object.
211
+ :param int odim: dimension of outputs
212
+ :param Namespace args: argument Namespace containing options
213
+ """
214
+ torch.nn.Module.__init__(self)
215
+ if args.transformer_attn_dropout_rate is None:
216
+ args.transformer_attn_dropout_rate = args.dropout_rate
217
+ # Check the relative positional encoding type
218
+ self.rel_pos_type = getattr(args, "rel_pos_type", None)
219
+ if self.rel_pos_type is None and args.transformer_encoder_attn_layer_type == "rel_mha":
220
+ args.transformer_encoder_attn_layer_type = "legacy_rel_mha"
221
+ logging.warning(
222
+ "Using legacy_rel_pos and it will be deprecated in the future."
223
+ )
224
+
225
+ idim = 80
226
+
227
+ self.encoder = Encoder(
228
+ idim=idim,
229
+ attention_dim=args.adim,
230
+ attention_heads=args.aheads,
231
+ linear_units=args.eunits,
232
+ num_blocks=args.elayers,
233
+ input_layer=args.transformer_input_layer,
234
+ dropout_rate=args.dropout_rate,
235
+ positional_dropout_rate=args.dropout_rate,
236
+ attention_dropout_rate=args.transformer_attn_dropout_rate,
237
+ encoder_attn_layer_type=args.transformer_encoder_attn_layer_type,
238
+ macaron_style=args.macaron_style,
239
+ use_cnn_module=args.use_cnn_module,
240
+ cnn_module_kernel=args.cnn_module_kernel,
241
+ zero_triu=getattr(args, "zero_triu", False),
242
+ a_upsample_ratio=args.a_upsample_ratio,
243
+ relu_type=getattr(args, "relu_type", "swish"),
244
+ )
245
+
246
+ self.transformer_input_layer = args.transformer_input_layer
247
+ self.a_upsample_ratio = args.a_upsample_ratio
248
+
249
+ self.aux_encoder = Encoder(
250
+ idim=idim,
251
+ attention_dim=args.aux_adim,
252
+ attention_heads=args.aux_aheads,
253
+ linear_units=args.aux_eunits,
254
+ num_blocks=args.aux_elayers,
255
+ input_layer=args.aux_transformer_input_layer,
256
+ dropout_rate=args.aux_dropout_rate,
257
+ positional_dropout_rate=args.aux_dropout_rate,
258
+ attention_dropout_rate=args.aux_transformer_attn_dropout_rate,
259
+ encoder_attn_layer_type=args.aux_transformer_encoder_attn_layer_type,
260
+ macaron_style=args.aux_macaron_style,
261
+ use_cnn_module=args.aux_use_cnn_module,
262
+ cnn_module_kernel=args.aux_cnn_module_kernel,
263
+ zero_triu=getattr(args, "aux_zero_triu", False),
264
+ a_upsample_ratio=args.aux_a_upsample_ratio,
265
+ relu_type=getattr(args, "aux_relu_type", "swish"),
266
+ )
267
+ self.aux_transformer_input_layer = args.aux_transformer_input_layer
268
+
269
+ self.fusion = MLPHead(
270
+ idim=args.adim + args.aux_adim,
271
+ hdim=args.fusion_hdim,
272
+ odim=args.adim,
273
+ norm=args.fusion_norm,
274
+ )
275
+
276
+ if args.mtlalpha < 1:
277
+ self.decoder = Decoder(
278
+ odim=odim,
279
+ attention_dim=args.adim,
280
+ attention_heads=args.aheads,
281
+ linear_units=args.dunits,
282
+ num_blocks=args.dlayers,
283
+ dropout_rate=args.dropout_rate,
284
+ positional_dropout_rate=args.dropout_rate,
285
+ self_attention_dropout_rate=args.transformer_attn_dropout_rate,
286
+ src_attention_dropout_rate=args.transformer_attn_dropout_rate,
287
+ )
288
+ else:
289
+ self.decoder = None
290
+ self.blank = 0
291
+ self.sos = odim - 1
292
+ self.eos = odim - 1
293
+ self.odim = odim
294
+ self.ignore_id = ignore_id
295
+ self.subsample = get_subsample(args, mode="asr", arch="transformer")
296
+
297
+ # self.lsm_weight = a
298
+ self.criterion = LabelSmoothingLoss(
299
+ self.odim,
300
+ self.ignore_id,
301
+ args.lsm_weight,
302
+ args.transformer_length_normalized_loss,
303
+ )
304
+
305
+ self.adim = args.adim
306
+ self.mtlalpha = args.mtlalpha
307
+ if args.mtlalpha > 0.0:
308
+ self.ctc = CTC(
309
+ odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True
310
+ )
311
+ else:
312
+ self.ctc = None
313
+
314
+ if args.report_cer or args.report_wer:
315
+ self.error_calculator = ErrorCalculator(
316
+ args.char_list,
317
+ args.sym_space,
318
+ args.sym_blank,
319
+ args.report_cer,
320
+ args.report_wer,
321
+ )
322
+ else:
323
+ self.error_calculator = None
324
+ self.rnnlm = None
325
+
326
+ def scorers(self):
327
+ """Scorers."""
328
+ return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
329
+
330
+ def encode(self, x, aux_x, extract_resnet_feats=False):
331
+ """Encode acoustic features.
332
+
333
+ :param ndarray x: source acoustic feature (T, D)
334
+ :return: encoder outputs
335
+ :rtype: torch.Tensor
336
+ """
337
+ self.eval()
338
+ if extract_resnet_feats:
339
+ x = torch.as_tensor(x).unsqueeze(0)
340
+ resnet_feats = self.encoder(
341
+ x,
342
+ None,
343
+ extract_resnet_feats=extract_resnet_feats,
344
+ )
345
+ return resnet_feats.squeeze(0)
346
+ else:
347
+ x = torch.as_tensor(x).unsqueeze(0)
348
+ aux_x = torch.as_tensor(aux_x).unsqueeze(0)
349
+ feat, _ = self.encoder(x, None)
350
+ aux_feat, _ = self.aux_encoder(aux_x, None)
351
+ fus_output = self.fusion(torch.cat((feat, aux_feat), dim=-1))
352
+ return fus_output.squeeze(0)
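The audiovisual variant differs at inference only in that encode() takes both streams and fuses them with the MLP head. A hypothetical call sketch, assuming `model` is a loaded instance of this class and `video_feats`/`audio_feats` are preprocessed arrays matching the configured input layers:

import torch

with torch.no_grad():
    # concat(video, audio) along the feature axis, then the fusion MLP -> (T, adim)
    enc = model.encode(video_feats, audio_feats)
print(enc.shape)

The fused memory can then be fed to the same beam-search wiring shown after the single-stream model above.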
espnet/nets/pytorch_backend/lm/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Initialize sub package."""
espnet/nets/pytorch_backend/lm/default.py ADDED
@@ -0,0 +1,431 @@
1
+ """Default Recurrent Neural Network Languge Model in `lm_train.py`."""
2
+
3
+ from typing import Any
4
+ from typing import List
5
+ from typing import Tuple
6
+
7
+ import logging
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ from espnet.nets.lm_interface import LMInterface
13
+ from espnet.nets.pytorch_backend.e2e_asr import to_device
14
+ from espnet.nets.scorer_interface import BatchScorerInterface
15
+ from espnet.utils.cli_utils import strtobool
16
+
17
+
18
+ class DefaultRNNLM(BatchScorerInterface, LMInterface, nn.Module):
19
+ """Default RNNLM for `LMInterface` Implementation.
20
+
21
+ Note:
22
+ PyTorch seems to have a memory leak when a single GPU computes this after data parallel.
23
+ If parallel GPUs compute this, it seems to be fine.
24
+ See also https://github.com/espnet/espnet/issues/1075
25
+
26
+ """
27
+
28
+ @staticmethod
29
+ def add_arguments(parser):
30
+ """Add arguments to command line argument parser."""
31
+ parser.add_argument(
32
+ "--type",
33
+ type=str,
34
+ default="lstm",
35
+ nargs="?",
36
+ choices=["lstm", "gru"],
37
+ help="Which type of RNN to use",
38
+ )
39
+ parser.add_argument(
40
+ "--layer", "-l", type=int, default=2, help="Number of hidden layers"
41
+ )
42
+ parser.add_argument(
43
+ "--unit", "-u", type=int, default=650, help="Number of hidden units"
44
+ )
45
+ parser.add_argument(
46
+ "--embed-unit",
47
+ default=None,
48
+ type=int,
49
+ help="Number of hidden units in embedding layer, "
50
+ "if it is not specified, it keeps the same number with hidden units.",
51
+ )
52
+ parser.add_argument(
53
+ "--dropout-rate", type=float, default=0.5, help="dropout probability"
54
+ )
55
+ parser.add_argument(
56
+ "--emb-dropout-rate",
57
+ type=float,
58
+ default=0.0,
59
+ help="emb dropout probability",
60
+ )
61
+ parser.add_argument(
62
+ "--tie-weights",
63
+ type=strtobool,
64
+ default=False,
65
+ help="Tie input and output embeddings",
66
+ )
67
+ return parser
68
+
69
+ def __init__(self, n_vocab, args):
70
+ """Initialize class.
71
+
72
+ Args:
73
+ n_vocab (int): The size of the vocabulary
74
+ args (argparse.Namespace): configurations. see py:method:`add_arguments`
75
+
76
+ """
77
+ nn.Module.__init__(self)
78
+ # NOTE: for a compatibility with less than 0.5.0 version models
79
+ dropout_rate = getattr(args, "dropout_rate", 0.0)
80
+ # NOTE: for a compatibility with less than 0.6.1 version models
81
+ embed_unit = getattr(args, "embed_unit", None)
82
+ # NOTE: for a compatibility with less than 0.9.7 version models
83
+ emb_dropout_rate = getattr(args, "emb_dropout_rate", 0.0)
84
+ # NOTE: for a compatibility with less than 0.9.7 version models
85
+ tie_weights = getattr(args, "tie_weights", False)
86
+
87
+ self.model = ClassifierWithState(
88
+ RNNLM(
89
+ n_vocab,
90
+ args.layer,
91
+ args.unit,
92
+ embed_unit,
93
+ args.type,
94
+ dropout_rate,
95
+ emb_dropout_rate,
96
+ tie_weights,
97
+ )
98
+ )
99
+
100
+ def state_dict(self):
101
+ """Dump state dict."""
102
+ return self.model.state_dict()
103
+
104
+ def load_state_dict(self, d):
105
+ """Load state dict."""
106
+ self.model.load_state_dict(d)
107
+
108
+ def forward(self, x, t):
109
+ """Compute LM loss value from buffer sequences.
110
+
111
+ Args:
112
+ x (torch.Tensor): Input ids. (batch, len)
113
+ t (torch.Tensor): Target ids. (batch, len)
114
+
115
+ Returns:
116
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
117
+ loss to backward (scalar),
118
+ negative log-likelihood of t: -log p(t) (scalar) and
119
+ the number of elements in x (scalar)
120
+
121
+ Notes:
122
+ The last two return values are used
123
+ in perplexity: p(t)^{-n} = exp(-log p(t) / n)
124
+
125
+ """
126
+ loss = 0
127
+ logp = 0
128
+ count = torch.tensor(0).long()
129
+ state = None
130
+ batch_size, sequence_length = x.shape
131
+ for i in range(sequence_length):
132
+ # Compute the loss at this time step and accumulate it
133
+ state, loss_batch = self.model(state, x[:, i], t[:, i])
134
+ non_zeros = torch.sum(x[:, i] != 0, dtype=loss_batch.dtype)
135
+ loss += loss_batch.mean() * non_zeros
136
+ logp += torch.sum(loss_batch * non_zeros)
137
+ count += int(non_zeros)
138
+ return loss / batch_size, loss, count.to(loss.device)
139
+
140
+ def score(self, y, state, x):
141
+ """Score new token.
142
+
143
+ Args:
144
+ y (torch.Tensor): 1D torch.int64 prefix tokens.
145
+ state: Scorer state for prefix tokens
146
+ x (torch.Tensor): 2D encoder feature that generates ys.
147
+
148
+ Returns:
149
+ tuple[torch.Tensor, Any]: Tuple of
150
+ torch.float32 scores for next token (n_vocab)
151
+ and next state for ys
152
+
153
+ """
154
+ new_state, scores = self.model.predict(state, y[-1].unsqueeze(0))
155
+ return scores.squeeze(0), new_state
156
+
157
+ def final_score(self, state):
158
+ """Score eos.
159
+
160
+ Args:
161
+ state: Scorer state for prefix tokens
162
+
163
+ Returns:
164
+ float: final score
165
+
166
+ """
167
+ return self.model.final(state)
168
+
169
+ # batch beam search API (see BatchScorerInterface)
170
+ def batch_score(
171
+ self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
172
+ ) -> Tuple[torch.Tensor, List[Any]]:
173
+ """Score new token batch.
174
+
175
+ Args:
176
+ ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
177
+ states (List[Any]): Scorer states for prefix tokens.
178
+ xs (torch.Tensor):
179
+ The encoder feature that generates ys (n_batch, xlen, n_feat).
180
+
181
+ Returns:
182
+ tuple[torch.Tensor, List[Any]]: Tuple of
183
+ batchfied scores for next token with shape of `(n_batch, n_vocab)`
184
+ and next state list for ys.
185
+
186
+ """
187
+ # merge states
188
+ n_batch = len(ys)
189
+ n_layers = self.model.predictor.n_layers
190
+ if self.model.predictor.typ == "lstm":
191
+ keys = ("c", "h")
192
+ else:
193
+ keys = ("h",)
194
+
195
+ if states[0] is None:
196
+ states = None
197
+ else:
198
+ # transpose state of [batch, key, layer] into [key, layer, batch]
199
+ states = {
200
+ k: [
201
+ torch.stack([states[b][k][i] for b in range(n_batch)])
202
+ for i in range(n_layers)
203
+ ]
204
+ for k in keys
205
+ }
206
+ states, logp = self.model.predict(states, ys[:, -1])
207
+
208
+ # transpose state of [key, layer, batch] into [batch, key, layer]
209
+ return (
210
+ logp,
211
+ [
212
+ {k: [states[k][i][b] for i in range(n_layers)] for k in keys}
213
+ for b in range(n_batch)
214
+ ],
215
+ )
216
+
217
+
218
+ class ClassifierWithState(nn.Module):
219
+ """A wrapper for pytorch RNNLM."""
220
+
221
+ def __init__(
222
+ self, predictor, lossfun=nn.CrossEntropyLoss(reduction="none"), label_key=-1
223
+ ):
224
+ """Initialize class.
225
+
226
+ :param torch.nn.Module predictor : The RNNLM
227
+ :param function lossfun : The loss function to use
228
+ :param int/str label_key :
229
+
230
+ """
231
+ if not (isinstance(label_key, (int, str))):
232
+ raise TypeError("label_key must be int or str, but is %s" % type(label_key))
233
+ super(ClassifierWithState, self).__init__()
234
+ self.lossfun = lossfun
235
+ self.y = None
236
+ self.loss = None
237
+ self.label_key = label_key
238
+ self.predictor = predictor
239
+
240
+ def forward(self, state, *args, **kwargs):
241
+ """Compute the loss value for an input and label pair.
242
+
243
+ Notes:
244
+ It also computes accuracy and stores it to the attribute.
245
+ When ``label_key`` is ``int``, the corresponding element in ``args``
246
+ is treated as ground truth labels. And when it is ``str``, the
247
+ element in ``kwargs`` is used.
248
+ The all elements of ``args`` and ``kwargs`` except the groundtruth
249
+ labels are features.
250
+ It feeds features to the predictor and compare the result
251
+ with ground truth labels.
252
+
253
+ :param torch.Tensor state : the LM state
254
+ :param list[torch.Tensor] args : Input minibatch
255
+ :param dict[torch.Tensor] kwargs : Input minibatch
256
+ :return loss value
257
+ :rtype torch.Tensor
258
+
259
+ """
260
+ if isinstance(self.label_key, int):
261
+ if not (-len(args) <= self.label_key < len(args)):
262
+ msg = "Label key %d is out of bounds" % self.label_key
263
+ raise ValueError(msg)
264
+ t = args[self.label_key]
265
+ if self.label_key == -1:
266
+ args = args[:-1]
267
+ else:
268
+ args = args[: self.label_key] + args[self.label_key + 1 :]
269
+ elif isinstance(self.label_key, str):
270
+ if self.label_key not in kwargs:
271
+ msg = 'Label key "%s" is not found' % self.label_key
272
+ raise ValueError(msg)
273
+ t = kwargs[self.label_key]
274
+ del kwargs[self.label_key]
275
+
276
+ self.y = None
277
+ self.loss = None
278
+ state, self.y = self.predictor(state, *args, **kwargs)
279
+ self.loss = self.lossfun(self.y, t)
280
+ return state, self.loss
281
+
282
+ def predict(self, state, x):
283
+ """Predict log probabilities for given state and input x using the predictor.
284
+
285
+ :param torch.Tensor state : The current state
286
+ :param torch.Tensor x : The input
287
+ :return a tuple (new state, log prob vector)
288
+ :rtype (torch.Tensor, torch.Tensor)
289
+ """
290
+ if hasattr(self.predictor, "normalized") and self.predictor.normalized:
291
+ return self.predictor(state, x)
292
+ else:
293
+ state, z = self.predictor(state, x)
294
+ return state, F.log_softmax(z, dim=1)
295
+
296
+ def buff_predict(self, state, x, n):
297
+ """Predict new tokens from buffered inputs."""
298
+ if self.predictor.__class__.__name__ == "RNNLM":
299
+ return self.predict(state, x)
300
+
301
+ new_state = []
302
+ new_log_y = []
303
+ for i in range(n):
304
+ state_i = None if state is None else state[i]
305
+ state_i, log_y = self.predict(state_i, x[i].unsqueeze(0))
306
+ new_state.append(state_i)
307
+ new_log_y.append(log_y)
308
+
309
+ return new_state, torch.cat(new_log_y)
310
+
311
+ def final(self, state, index=None):
312
+ """Predict final log probabilities for given state using the predictor.
313
+
314
+ :param state: The state
315
+ :return The final log probabilities
316
+ :rtype torch.Tensor
317
+ """
318
+ if hasattr(self.predictor, "final"):
319
+ if index is not None:
320
+ return self.predictor.final(state[index])
321
+ else:
322
+ return self.predictor.final(state)
323
+ else:
324
+ return 0.0
325
+
326
+
327
+ # Definition of a recurrent net for language modeling
328
+ class RNNLM(nn.Module):
329
+ """A pytorch RNNLM."""
330
+
331
+ def __init__(
332
+ self,
333
+ n_vocab,
334
+ n_layers,
335
+ n_units,
336
+ n_embed=None,
337
+ typ="lstm",
338
+ dropout_rate=0.5,
339
+ emb_dropout_rate=0.0,
340
+ tie_weights=False,
341
+ ):
342
+ """Initialize class.
343
+
344
+ :param int n_vocab: The size of the vocabulary
345
+ :param int n_layers: The number of layers to create
346
+ :param int n_units: The number of units per layer
347
+ :param str typ: The RNN type
348
+ """
349
+ super(RNNLM, self).__init__()
350
+ if n_embed is None:
351
+ n_embed = n_units
352
+
353
+ self.embed = nn.Embedding(n_vocab, n_embed)
354
+
355
+ if emb_dropout_rate == 0.0:
356
+ self.embed_drop = None
357
+ else:
358
+ self.embed_drop = nn.Dropout(emb_dropout_rate)
359
+
360
+ if typ == "lstm":
361
+ self.rnn = nn.ModuleList(
362
+ [nn.LSTMCell(n_embed, n_units)]
363
+ + [nn.LSTMCell(n_units, n_units) for _ in range(n_layers - 1)]
364
+ )
365
+ else:
366
+ self.rnn = nn.ModuleList(
367
+ [nn.GRUCell(n_embed, n_units)]
368
+ + [nn.GRUCell(n_units, n_units) for _ in range(n_layers - 1)]
369
+ )
370
+
371
+ self.dropout = nn.ModuleList(
372
+ [nn.Dropout(dropout_rate) for _ in range(n_layers + 1)]
373
+ )
374
+ self.lo = nn.Linear(n_units, n_vocab)
375
+ self.n_layers = n_layers
376
+ self.n_units = n_units
377
+ self.typ = typ
378
+
379
+ logging.info("Tie weights set to {}".format(tie_weights))
380
+ logging.info("Dropout set to {}".format(dropout_rate))
381
+ logging.info("Emb Dropout set to {}".format(emb_dropout_rate))
382
+
383
+ if tie_weights:
384
+ assert (
385
+ n_embed == n_units
386
+ ), "Tie Weights: True need embedding and final dimensions to match"
387
+ self.lo.weight = self.embed.weight
388
+
389
+ # initialize parameters from uniform distribution
390
+ for param in self.parameters():
391
+ param.data.uniform_(-0.1, 0.1)
392
+
393
+ def zero_state(self, batchsize):
394
+ """Initialize state."""
395
+ p = next(self.parameters())
396
+ return torch.zeros(batchsize, self.n_units).to(device=p.device, dtype=p.dtype)
397
+
398
+ def forward(self, state, x):
399
+ """Forward neural networks."""
400
+ if state is None:
401
+ h = [to_device(x, self.zero_state(x.size(0))) for n in range(self.n_layers)]
402
+ state = {"h": h}
403
+ if self.typ == "lstm":
404
+ c = [
405
+ to_device(x, self.zero_state(x.size(0)))
406
+ for n in range(self.n_layers)
407
+ ]
408
+ state = {"c": c, "h": h}
409
+
410
+ h = [None] * self.n_layers
411
+ if self.embed_drop is not None:
412
+ emb = self.embed_drop(self.embed(x))
413
+ else:
414
+ emb = self.embed(x)
415
+ if self.typ == "lstm":
416
+ c = [None] * self.n_layers
417
+ h[0], c[0] = self.rnn[0](
418
+ self.dropout[0](emb), (state["h"][0], state["c"][0])
419
+ )
420
+ for n in range(1, self.n_layers):
421
+ h[n], c[n] = self.rnn[n](
422
+ self.dropout[n](h[n - 1]), (state["h"][n], state["c"][n])
423
+ )
424
+ state = {"c": c, "h": h}
425
+ else:
426
+ h[0] = self.rnn[0](self.dropout[0](emb), state["h"][0])
427
+ for n in range(1, self.n_layers):
428
+ h[n] = self.rnn[n](self.dropout[n](h[n - 1]), state["h"][n])
429
+ state = {"h": h}
430
+ y = self.lo(self.dropout[-1](h[-1]))
431
+ return state, y
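A minimal sketch (not in the commit) of building this RNNLM from its declared defaults and scoring a prefix; the vocabulary size and token ids are made up, and it assumes the module's own imports (e.g. to_device from espnet.nets.pytorch_backend.e2e_asr) resolve in this tree.

import argparse
import torch
from espnet.nets.pytorch_backend.lm.default import DefaultRNNLM

parser = argparse.ArgumentParser()
DefaultRNNLM.add_arguments(parser)
args = parser.parse_args([])            # lstm, 2 layers, 650 units by default

lm = DefaultRNNLM(n_vocab=100, args=args)
lm.eval()

prefix = torch.tensor([1, 23, 42])      # arbitrary token ids; score() consumes only the last one
with torch.no_grad():
    scores, state = lm.score(prefix, None, x=None)
print(scores.shape)                     # torch.Size([100]) log-probabilities for the next token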
espnet/nets/pytorch_backend/lm/seq_rnn.py ADDED
@@ -0,0 +1,178 @@
1
+ """Sequential implementation of Recurrent Neural Network Language Model."""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from espnet.nets.lm_interface import LMInterface
8
+
9
+
10
+ class SequentialRNNLM(LMInterface, torch.nn.Module):
11
+ """Sequential RNNLM.
12
+
13
+ See also:
14
+ https://github.com/pytorch/examples/blob/4581968193699de14b56527296262dd76ab43557/word_language_model/model.py
15
+
16
+ """
17
+
18
+ @staticmethod
19
+ def add_arguments(parser):
20
+ """Add arguments to command line argument parser."""
21
+ parser.add_argument(
22
+ "--type",
23
+ type=str,
24
+ default="lstm",
25
+ nargs="?",
26
+ choices=["lstm", "gru"],
27
+ help="Which type of RNN to use",
28
+ )
29
+ parser.add_argument(
30
+ "--layer", "-l", type=int, default=2, help="Number of hidden layers"
31
+ )
32
+ parser.add_argument(
33
+ "--unit", "-u", type=int, default=650, help="Number of hidden units"
34
+ )
35
+ parser.add_argument(
36
+ "--dropout-rate", type=float, default=0.5, help="dropout probability"
37
+ )
38
+ return parser
39
+
40
+ def __init__(self, n_vocab, args):
41
+ """Initialize class.
42
+
43
+ Args:
44
+ n_vocab (int): The size of the vocabulary
45
+ args (argparse.Namespace): configurations. see py:method:`add_arguments`
46
+
47
+ """
48
+ torch.nn.Module.__init__(self)
49
+ self._setup(
50
+ rnn_type=args.type.upper(),
51
+ ntoken=n_vocab,
52
+ ninp=args.unit,
53
+ nhid=args.unit,
54
+ nlayers=args.layer,
55
+ dropout=args.dropout_rate,
56
+ )
57
+
58
+ def _setup(
59
+ self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False
60
+ ):
61
+ self.drop = nn.Dropout(dropout)
62
+ self.encoder = nn.Embedding(ntoken, ninp)
63
+ if rnn_type in ["LSTM", "GRU"]:
64
+ self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
65
+ else:
66
+ try:
67
+ nonlinearity = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}[rnn_type]
68
+ except KeyError:
69
+ raise ValueError(
70
+ "An invalid option for `--model` was supplied, "
71
+ "options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']"
72
+ )
73
+ self.rnn = nn.RNN(
74
+ ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout
75
+ )
76
+ self.decoder = nn.Linear(nhid, ntoken)
77
+
78
+ # Optionally tie weights as in:
79
+ # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
80
+ # https://arxiv.org/abs/1608.05859
81
+ # and
82
+ # "Tying Word Vectors and Word Classifiers:
83
+ # A Loss Framework for Language Modeling" (Inan et al. 2016)
84
+ # https://arxiv.org/abs/1611.01462
85
+ if tie_weights:
86
+ if nhid != ninp:
87
+ raise ValueError(
88
+ "When using the tied flag, nhid must be equal to emsize"
89
+ )
90
+ self.decoder.weight = self.encoder.weight
91
+
92
+ self._init_weights()
93
+
94
+ self.rnn_type = rnn_type
95
+ self.nhid = nhid
96
+ self.nlayers = nlayers
97
+
98
+ def _init_weights(self):
99
+ # NOTE: original init in pytorch/examples
100
+ # initrange = 0.1
101
+ # self.encoder.weight.data.uniform_(-initrange, initrange)
102
+ # self.decoder.bias.data.zero_()
103
+ # self.decoder.weight.data.uniform_(-initrange, initrange)
104
+ # NOTE: our default.py:RNNLM init
105
+ for param in self.parameters():
106
+ param.data.uniform_(-0.1, 0.1)
107
+
108
+ def forward(self, x, t):
109
+ """Compute LM loss value from buffer sequences.
110
+
111
+ Args:
112
+ x (torch.Tensor): Input ids. (batch, len)
113
+ t (torch.Tensor): Target ids. (batch, len)
114
+
115
+ Returns:
116
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
117
+ loss to backward (scalar),
118
+ negative log-likelihood of t: -log p(t) (scalar) and
119
+ the number of elements in x (scalar)
120
+
121
+ Notes:
122
+ The last two return values are used
123
+ in perplexity: p(t)^{-n} = exp(-log p(t) / n)
124
+
125
+ """
126
+ y = self._before_loss(x, None)[0]
127
+ mask = (x != 0).to(y.dtype)
128
+ loss = F.cross_entropy(y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
129
+ logp = loss * mask.view(-1)
130
+ logp = logp.sum()
131
+ count = mask.sum()
132
+ return logp / count, logp, count
133
+
134
+ def _before_loss(self, input, hidden):
135
+ emb = self.drop(self.encoder(input))
136
+ output, hidden = self.rnn(emb, hidden)
137
+ output = self.drop(output)
138
+ decoded = self.decoder(
139
+ output.view(output.size(0) * output.size(1), output.size(2))
140
+ )
141
+ return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
142
+
143
+ def init_state(self, x):
144
+ """Get an initial state for decoding.
145
+
146
+ Args:
147
+ x (torch.Tensor): The encoded feature tensor
148
+
149
+ Returns: initial state
150
+
151
+ """
152
+ bsz = 1
153
+ weight = next(self.parameters())
154
+ if self.rnn_type == "LSTM":
155
+ return (
156
+ weight.new_zeros(self.nlayers, bsz, self.nhid),
157
+ weight.new_zeros(self.nlayers, bsz, self.nhid),
158
+ )
159
+ else:
160
+ return weight.new_zeros(self.nlayers, bsz, self.nhid)
161
+
162
+ def score(self, y, state, x):
163
+ """Score new token.
164
+
165
+ Args:
166
+ y (torch.Tensor): 1D torch.int64 prefix tokens.
167
+ state: Scorer state for prefix tokens
168
+ x (torch.Tensor): 2D encoder feature that generates ys.
169
+
170
+ Returns:
171
+ tuple[torch.Tensor, Any]: Tuple of
172
+ torch.float32 scores for next token (n_vocab)
173
+ and next state for ys
174
+
175
+ """
176
+ y, new_state = self._before_loss(y[-1].view(1, 1), state)
177
+ logp = y.log_softmax(dim=-1).view(-1)
178
+ return logp, new_state
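The sequential variant exposes the same score() contract but keeps an explicit (h, c) state. A minimal sketch with the same made-up vocabulary:

import argparse
import torch
from espnet.nets.pytorch_backend.lm.seq_rnn import SequentialRNNLM

parser = argparse.ArgumentParser()
SequentialRNNLM.add_arguments(parser)
args = parser.parse_args([])            # LSTM, 2 layers, 650 units by default

lm = SequentialRNNLM(n_vocab=100, args=args)
lm.eval()

state = lm.init_state(None)             # (h0, c0), each (nlayers, 1, nhid)
y = torch.tensor([1, 5, 7])             # prefix; score() feeds y[-1] into the RNN
with torch.no_grad():
    logp, state = lm.score(y, state, x=None)
print(logp.shape)                       # torch.Size([100])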
espnet/nets/pytorch_backend/lm/transformer.py ADDED
@@ -0,0 +1,252 @@
1
+ """Transformer language model."""
2
+
3
+ from typing import Any
4
+ from typing import List
5
+ from typing import Tuple
6
+
7
+ import logging
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ from espnet.nets.lm_interface import LMInterface
13
+ from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
14
+ from espnet.nets.pytorch_backend.transformer.encoder import Encoder
15
+ from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
16
+ from espnet.nets.scorer_interface import BatchScorerInterface
17
+ from espnet.utils.cli_utils import strtobool
18
+
19
+
20
+ class TransformerLM(nn.Module, LMInterface, BatchScorerInterface):
21
+ """Transformer language model."""
22
+
23
+ @staticmethod
24
+ def add_arguments(parser):
25
+ """Add arguments to command line argument parser."""
26
+ parser.add_argument(
27
+ "--layer", type=int, default=4, help="Number of hidden layers"
28
+ )
29
+ parser.add_argument(
30
+ "--unit",
31
+ type=int,
32
+ default=1024,
33
+ help="Number of hidden units in feedforward layer",
34
+ )
35
+ parser.add_argument(
36
+ "--att-unit",
37
+ type=int,
38
+ default=256,
39
+ help="Number of hidden units in attention layer",
40
+ )
41
+ parser.add_argument(
42
+ "--embed-unit",
43
+ type=int,
44
+ default=128,
45
+ help="Number of hidden units in embedding layer",
46
+ )
47
+ parser.add_argument(
48
+ "--head", type=int, default=2, help="Number of multi head attention"
49
+ )
50
+ parser.add_argument(
51
+ "--dropout-rate", type=float, default=0.5, help="dropout probability"
52
+ )
53
+ parser.add_argument(
54
+ "--att-dropout-rate",
55
+ type=float,
56
+ default=0.0,
57
+ help="att dropout probability",
58
+ )
59
+ parser.add_argument(
60
+ "--emb-dropout-rate",
61
+ type=float,
62
+ default=0.0,
63
+ help="emb dropout probability",
64
+ )
65
+ parser.add_argument(
66
+ "--tie-weights",
67
+ type=strtobool,
68
+ default=False,
69
+ help="Tie input and output embeddings",
70
+ )
71
+ parser.add_argument(
72
+ "--pos-enc",
73
+ default="sinusoidal",
74
+ choices=["sinusoidal", "none"],
75
+ help="positional encoding",
76
+ )
77
+ return parser
78
+
79
+ def __init__(self, n_vocab, args):
80
+ """Initialize class.
81
+
82
+ Args:
83
+ n_vocab (int): The size of the vocabulary
84
+ args (argparse.Namespace): configurations. see py:method:`add_arguments`
85
+
86
+ """
87
+ nn.Module.__init__(self)
88
+
89
+ # NOTE: for a compatibility with less than 0.9.7 version models
90
+ emb_dropout_rate = getattr(args, "emb_dropout_rate", 0.0)
91
+ # NOTE: for a compatibility with less than 0.9.7 version models
92
+ tie_weights = getattr(args, "tie_weights", False)
93
+ # NOTE: for a compatibility with less than 0.9.7 version models
94
+ att_dropout_rate = getattr(args, "att_dropout_rate", 0.0)
95
+
96
+ if args.pos_enc == "sinusoidal":
97
+ pos_enc_class = PositionalEncoding
98
+ elif args.pos_enc == "none":
99
+
100
+ def pos_enc_class(*args, **kwargs):
101
+ return nn.Sequential() # identity
102
+
103
+ else:
104
+ raise ValueError(f"unknown pos-enc option: {args.pos_enc}")
105
+
106
+ self.embed = nn.Embedding(n_vocab, args.embed_unit)
107
+
108
+ if emb_dropout_rate == 0.0:
109
+ self.embed_drop = None
110
+ else:
111
+ self.embed_drop = nn.Dropout(emb_dropout_rate)
112
+
113
+ self.encoder = Encoder(
114
+ idim=args.embed_unit,
115
+ attention_dim=args.att_unit,
116
+ attention_heads=args.head,
117
+ linear_units=args.unit,
118
+ num_blocks=args.layer,
119
+ dropout_rate=args.dropout_rate,
120
+ attention_dropout_rate=att_dropout_rate,
121
+ input_layer="linear",
122
+ pos_enc_class=pos_enc_class,
123
+ )
124
+ self.decoder = nn.Linear(args.att_unit, n_vocab)
125
+
126
+ logging.info("Tie weights set to {}".format(tie_weights))
127
+ logging.info("Dropout set to {}".format(args.dropout_rate))
128
+ logging.info("Emb Dropout set to {}".format(emb_dropout_rate))
129
+ logging.info("Att Dropout set to {}".format(att_dropout_rate))
130
+
131
+ if tie_weights:
132
+ assert (
133
+ args.att_unit == args.embed_unit
134
+ ), "Tie Weights: True need embedding and final dimensions to match"
135
+ self.decoder.weight = self.embed.weight
136
+
137
+ def _target_mask(self, ys_in_pad):
138
+ ys_mask = ys_in_pad != 0
139
+ m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0)
140
+ return ys_mask.unsqueeze(-2) & m
141
+
142
+ def forward(
143
+ self, x: torch.Tensor, t: torch.Tensor
144
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
145
+ """Compute LM loss value from buffer sequences.
146
+
147
+ Args:
148
+ x (torch.Tensor): Input ids. (batch, len)
149
+ t (torch.Tensor): Target ids. (batch, len)
150
+
151
+ Returns:
152
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple of
153
+ loss to backward (scalar),
154
+ negative log-likelihood of t: -log p(t) (scalar) and
155
+ the number of elements in x (scalar)
156
+
157
+ Notes:
158
+ The last two return values are used
159
+ in perplexity: p(t)^{-1/n} = exp(-log p(t) / n)
160
+
161
+ """
162
+ xm = x != 0
163
+
164
+ if self.embed_drop is not None:
165
+ emb = self.embed_drop(self.embed(x))
166
+ else:
167
+ emb = self.embed(x)
168
+
169
+ h, _ = self.encoder(emb, self._target_mask(x))
170
+ y = self.decoder(h)
171
+ loss = F.cross_entropy(y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
172
+ mask = xm.to(dtype=loss.dtype)
173
+ logp = loss * mask.view(-1)
174
+ logp = logp.sum()
175
+ count = mask.sum()
176
+ return logp / count, logp, count
177
+
178
+ def score(
179
+ self, y: torch.Tensor, state: Any, x: torch.Tensor
180
+ ) -> Tuple[torch.Tensor, Any]:
181
+ """Score new token.
182
+
183
+ Args:
184
+ y (torch.Tensor): 1D torch.int64 prefix tokens.
185
+ state: Scorer state for prefix tokens
186
+ x (torch.Tensor): encoder feature that generates ys.
187
+
188
+ Returns:
189
+ tuple[torch.Tensor, Any]: Tuple of
190
+ torch.float32 scores for next token (n_vocab)
191
+ and next state for ys
192
+
193
+ """
194
+ y = y.unsqueeze(0)
195
+
196
+ if self.embed_drop is not None:
197
+ emb = self.embed_drop(self.embed(y))
198
+ else:
199
+ emb = self.embed(y)
200
+
201
+ h, _, cache = self.encoder.forward_one_step(
202
+ emb, self._target_mask(y), cache=state
203
+ )
204
+ h = self.decoder(h[:, -1])
205
+ logp = h.log_softmax(dim=-1).squeeze(0)
206
+ return logp, cache
207
+
208
+ # batch beam search API (see BatchScorerInterface)
209
+ def batch_score(
210
+ self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
211
+ ) -> Tuple[torch.Tensor, List[Any]]:
212
+ """Score new token batch (required).
213
+
214
+ Args:
215
+ ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
216
+ states (List[Any]): Scorer states for prefix tokens.
217
+ xs (torch.Tensor):
218
+ The encoder feature that generates ys (n_batch, xlen, n_feat).
219
+
220
+ Returns:
221
+ tuple[torch.Tensor, List[Any]]: Tuple of
222
+ batchfied scores for next token with shape of `(n_batch, n_vocab)`
223
+ and next state list for ys.
224
+
225
+ """
226
+ # merge states
227
+ n_batch = len(ys)
228
+ n_layers = len(self.encoder.encoders)
229
+ if states[0] is None:
230
+ batch_state = None
231
+ else:
232
+ # transpose state of [batch, layer] into [layer, batch]
233
+ batch_state = [
234
+ torch.stack([states[b][i] for b in range(n_batch)])
235
+ for i in range(n_layers)
236
+ ]
237
+
238
+ if self.embed_drop is not None:
239
+ emb = self.embed_drop(self.embed(ys))
240
+ else:
241
+ emb = self.embed(ys)
242
+
243
+ # batch decoding
244
+ h, _, states = self.encoder.forward_one_step(
245
+ emb, self._target_mask(ys), cache=batch_state
246
+ )
247
+ h = self.decoder(h[:, -1])
248
+ logp = h.log_softmax(dim=-1)
249
+
250
+ # transpose state of [layer, batch] into [batch, layer]
251
+ state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
252
+ return logp, state_list
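To make the contract above concrete, here is a minimal sketch (not part of the commit) that drives the forward pass with the default hyperparameters registered in add_arguments. The import path and class name TransformerLM are assumptions for illustration; only the constructor arguments and the (loss, nll, count) return values are taken from the code above.

import argparse
import torch
# Assumed location/name of the LM defined above; adjust to the actual module.
from espnet.nets.pytorch_backend.lm.transformer import TransformerLM

# Defaults mirror the add_arguments() options shown in this diff.
args = argparse.Namespace(
    layer=4, unit=1024, att_unit=256, embed_unit=128, head=2,
    dropout_rate=0.5, att_dropout_rate=0.0, emb_dropout_rate=0.0,
    tie_weights=False, pos_enc="sinusoidal",
)
lm = TransformerLM(n_vocab=1000, args=args)
x = torch.randint(1, 1000, (2, 16))   # token id 0 is treated as padding
t = torch.randint(1, 1000, (2, 16))
loss, nll, count = lm(x, t)
perplexity = torch.exp(nll / count)   # as noted in the forward() docstring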
espnet/nets/pytorch_backend/nets_utils.py ADDED
@@ -0,0 +1,526 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """Network related utility tools."""
4
+
5
+ import logging
6
+ from typing import Dict
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ def to_device(m, x):
13
+ """Send tensor into the device of the module.
14
+
15
+ Args:
16
+ m (torch.nn.Module): Torch module.
17
+ x (Tensor): Torch tensor.
18
+
19
+ Returns:
20
+ Tensor: Torch tensor located in the same place as torch module.
21
+
22
+ """
23
+ if isinstance(m, torch.nn.Module):
24
+ device = next(m.parameters()).device
25
+ elif isinstance(m, torch.Tensor):
26
+ device = m.device
27
+ else:
28
+ raise TypeError(
29
+ "Expected torch.nn.Module or torch.tensor, " f"bot got: {type(m)}"
30
+ )
31
+ return x.to(device)
32
+
33
+
34
+ def pad_list(xs, pad_value):
35
+ """Perform padding for the list of tensors.
36
+
37
+ Args:
38
+ xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
39
+ pad_value (float): Value for padding.
40
+
41
+ Returns:
42
+ Tensor: Padded tensor (B, Tmax, `*`).
43
+
44
+ Examples:
45
+ >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
46
+ >>> x
47
+ [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
48
+ >>> pad_list(x, 0)
49
+ tensor([[1., 1., 1., 1.],
50
+ [1., 1., 0., 0.],
51
+ [1., 0., 0., 0.]])
52
+
53
+ """
54
+ n_batch = len(xs)
55
+ max_len = max(x.size(0) for x in xs)
56
+ pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
57
+
58
+ for i in range(n_batch):
59
+ pad[i, : xs[i].size(0)] = xs[i]
60
+
61
+ return pad
62
+
63
+
64
+ def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
65
+ """Make mask tensor containing indices of padded part.
66
+
67
+ Args:
68
+ lengths (LongTensor or List): Batch of lengths (B,).
69
+ xs (Tensor, optional): The reference tensor.
70
+ If set, masks will be the same shape as this tensor.
71
+ length_dim (int, optional): Dimension indicator of the above tensor.
72
+ See the example.
73
+
74
+ Returns:
75
+ Tensor: Mask tensor containing indices of padded part.
76
+ dtype=torch.uint8 in PyTorch 1.2-
77
+ dtype=torch.bool in PyTorch 1.2+ (including 1.2)
78
+
79
+ Examples:
80
+ With only lengths.
81
+
82
+ >>> lengths = [5, 3, 2]
83
+ >>> make_pad_mask(lengths)
84
+ masks = [[0, 0, 0, 0 ,0],
85
+ [0, 0, 0, 1, 1],
86
+ [0, 0, 1, 1, 1]]
87
+
88
+ With the reference tensor.
89
+
90
+ >>> xs = torch.zeros((3, 2, 4))
91
+ >>> make_pad_mask(lengths, xs)
92
+ tensor([[[0, 0, 0, 0],
93
+ [0, 0, 0, 0]],
94
+ [[0, 0, 0, 1],
95
+ [0, 0, 0, 1]],
96
+ [[0, 0, 1, 1],
97
+ [0, 0, 1, 1]]], dtype=torch.uint8)
98
+ >>> xs = torch.zeros((3, 2, 6))
99
+ >>> make_pad_mask(lengths, xs)
100
+ tensor([[[0, 0, 0, 0, 0, 1],
101
+ [0, 0, 0, 0, 0, 1]],
102
+ [[0, 0, 0, 1, 1, 1],
103
+ [0, 0, 0, 1, 1, 1]],
104
+ [[0, 0, 1, 1, 1, 1],
105
+ [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8)
106
+
107
+ With the reference tensor and dimension indicator.
108
+
109
+ >>> xs = torch.zeros((3, 6, 6))
110
+ >>> make_pad_mask(lengths, xs, 1)
111
+ tensor([[[0, 0, 0, 0, 0, 0],
112
+ [0, 0, 0, 0, 0, 0],
113
+ [0, 0, 0, 0, 0, 0],
114
+ [0, 0, 0, 0, 0, 0],
115
+ [0, 0, 0, 0, 0, 0],
116
+ [1, 1, 1, 1, 1, 1]],
117
+ [[0, 0, 0, 0, 0, 0],
118
+ [0, 0, 0, 0, 0, 0],
119
+ [0, 0, 0, 0, 0, 0],
120
+ [1, 1, 1, 1, 1, 1],
121
+ [1, 1, 1, 1, 1, 1],
122
+ [1, 1, 1, 1, 1, 1]],
123
+ [[0, 0, 0, 0, 0, 0],
124
+ [0, 0, 0, 0, 0, 0],
125
+ [1, 1, 1, 1, 1, 1],
126
+ [1, 1, 1, 1, 1, 1],
127
+ [1, 1, 1, 1, 1, 1],
128
+ [1, 1, 1, 1, 1, 1]]], dtype=torch.uint8)
129
+ >>> make_pad_mask(lengths, xs, 2)
130
+ tensor([[[0, 0, 0, 0, 0, 1],
131
+ [0, 0, 0, 0, 0, 1],
132
+ [0, 0, 0, 0, 0, 1],
133
+ [0, 0, 0, 0, 0, 1],
134
+ [0, 0, 0, 0, 0, 1],
135
+ [0, 0, 0, 0, 0, 1]],
136
+ [[0, 0, 0, 1, 1, 1],
137
+ [0, 0, 0, 1, 1, 1],
138
+ [0, 0, 0, 1, 1, 1],
139
+ [0, 0, 0, 1, 1, 1],
140
+ [0, 0, 0, 1, 1, 1],
141
+ [0, 0, 0, 1, 1, 1]],
142
+ [[0, 0, 1, 1, 1, 1],
143
+ [0, 0, 1, 1, 1, 1],
144
+ [0, 0, 1, 1, 1, 1],
145
+ [0, 0, 1, 1, 1, 1],
146
+ [0, 0, 1, 1, 1, 1],
147
+ [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8)
148
+
149
+ """
150
+ if length_dim == 0:
151
+ raise ValueError("length_dim cannot be 0: {}".format(length_dim))
152
+
153
+ if not isinstance(lengths, list):
154
+ lengths = lengths.tolist()
155
+ bs = int(len(lengths))
156
+ if maxlen is None:
157
+ if xs is None:
158
+ maxlen = int(max(lengths))
159
+ else:
160
+ maxlen = xs.size(length_dim)
161
+ else:
162
+ assert xs is None
163
+ assert maxlen >= int(max(lengths))
164
+
165
+ seq_range = torch.arange(0, maxlen, dtype=torch.int64)
166
+ seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen)
167
+ seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1)
168
+ mask = seq_range_expand >= seq_length_expand
169
+
170
+ if xs is not None:
171
+ assert xs.size(0) == bs, (xs.size(0), bs)
172
+
173
+ if length_dim < 0:
174
+ length_dim = xs.dim() + length_dim
175
+ # ind = (:, None, ..., None, :, None, ..., None)
176
+ ind = tuple(
177
+ slice(None) if i in (0, length_dim) else None for i in range(xs.dim())
178
+ )
179
+ mask = mask[ind].expand_as(xs).to(xs.device)
180
+ return mask
181
+
182
+
183
+ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
184
+ """Make mask tensor containing indices of non-padded part.
185
+
186
+ Args:
187
+ lengths (LongTensor or List): Batch of lengths (B,).
188
+ xs (Tensor, optional): The reference tensor.
189
+ If set, masks will be the same shape as this tensor.
190
+ length_dim (int, optional): Dimension indicator of the above tensor.
191
+ See the example.
192
+
193
+ Returns:
194
+ ByteTensor: mask tensor containing indices of non-padded part.
195
+ dtype=torch.uint8 in PyTorch 1.2-
196
+ dtype=torch.bool in PyTorch 1.2+ (including 1.2)
197
+
198
+ Examples:
199
+ With only lengths.
200
+
201
+ >>> lengths = [5, 3, 2]
202
+ >>> make_non_pad_mask(lengths)
203
+ masks = [[1, 1, 1, 1 ,1],
204
+ [1, 1, 1, 0, 0],
205
+ [1, 1, 0, 0, 0]]
206
+
207
+ With the reference tensor.
208
+
209
+ >>> xs = torch.zeros((3, 2, 4))
210
+ >>> make_non_pad_mask(lengths, xs)
211
+ tensor([[[1, 1, 1, 1],
212
+ [1, 1, 1, 1]],
213
+ [[1, 1, 1, 0],
214
+ [1, 1, 1, 0]],
215
+ [[1, 1, 0, 0],
216
+ [1, 1, 0, 0]]], dtype=torch.uint8)
217
+ >>> xs = torch.zeros((3, 2, 6))
218
+ >>> make_non_pad_mask(lengths, xs)
219
+ tensor([[[1, 1, 1, 1, 1, 0],
220
+ [1, 1, 1, 1, 1, 0]],
221
+ [[1, 1, 1, 0, 0, 0],
222
+ [1, 1, 1, 0, 0, 0]],
223
+ [[1, 1, 0, 0, 0, 0],
224
+ [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8)
225
+
226
+ With the reference tensor and dimension indicator.
227
+
228
+ >>> xs = torch.zeros((3, 6, 6))
229
+ >>> make_non_pad_mask(lengths, xs, 1)
230
+ tensor([[[1, 1, 1, 1, 1, 1],
231
+ [1, 1, 1, 1, 1, 1],
232
+ [1, 1, 1, 1, 1, 1],
233
+ [1, 1, 1, 1, 1, 1],
234
+ [1, 1, 1, 1, 1, 1],
235
+ [0, 0, 0, 0, 0, 0]],
236
+ [[1, 1, 1, 1, 1, 1],
237
+ [1, 1, 1, 1, 1, 1],
238
+ [1, 1, 1, 1, 1, 1],
239
+ [0, 0, 0, 0, 0, 0],
240
+ [0, 0, 0, 0, 0, 0],
241
+ [0, 0, 0, 0, 0, 0]],
242
+ [[1, 1, 1, 1, 1, 1],
243
+ [1, 1, 1, 1, 1, 1],
244
+ [0, 0, 0, 0, 0, 0],
245
+ [0, 0, 0, 0, 0, 0],
246
+ [0, 0, 0, 0, 0, 0],
247
+ [0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)
248
+ >>> make_non_pad_mask(lengths, xs, 2)
249
+ tensor([[[1, 1, 1, 1, 1, 0],
250
+ [1, 1, 1, 1, 1, 0],
251
+ [1, 1, 1, 1, 1, 0],
252
+ [1, 1, 1, 1, 1, 0],
253
+ [1, 1, 1, 1, 1, 0],
254
+ [1, 1, 1, 1, 1, 0]],
255
+ [[1, 1, 1, 0, 0, 0],
256
+ [1, 1, 1, 0, 0, 0],
257
+ [1, 1, 1, 0, 0, 0],
258
+ [1, 1, 1, 0, 0, 0],
259
+ [1, 1, 1, 0, 0, 0],
260
+ [1, 1, 1, 0, 0, 0]],
261
+ [[1, 1, 0, 0, 0, 0],
262
+ [1, 1, 0, 0, 0, 0],
263
+ [1, 1, 0, 0, 0, 0],
264
+ [1, 1, 0, 0, 0, 0],
265
+ [1, 1, 0, 0, 0, 0],
266
+ [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8)
267
+
268
+ """
269
+ return ~make_pad_mask(lengths, xs, length_dim)
270
+
271
+
272
+ def mask_by_length(xs, lengths, fill=0):
273
+ """Mask tensor according to length.
274
+
275
+ Args:
276
+ xs (Tensor): Batch of input tensor (B, `*`).
277
+ lengths (LongTensor or List): Batch of lengths (B,).
278
+ fill (int or float): Value to fill masked part.
279
+
280
+ Returns:
281
+ Tensor: Batch of masked input tensor (B, `*`).
282
+
283
+ Examples:
284
+ >>> x = torch.arange(5).repeat(3, 1) + 1
285
+ >>> x
286
+ tensor([[1, 2, 3, 4, 5],
287
+ [1, 2, 3, 4, 5],
288
+ [1, 2, 3, 4, 5]])
289
+ >>> lengths = [5, 3, 2]
290
+ >>> mask_by_length(x, lengths)
291
+ tensor([[1, 2, 3, 4, 5],
292
+ [1, 2, 3, 0, 0],
293
+ [1, 2, 0, 0, 0]])
294
+
295
+ """
296
+ assert xs.size(0) == len(lengths)
297
+ ret = xs.data.new(*xs.size()).fill_(fill)
298
+ for i, l in enumerate(lengths):
299
+ ret[i, :l] = xs[i, :l]
300
+ return ret
301
+
302
+
303
+ def th_accuracy(pad_outputs, pad_targets, ignore_label):
304
+ """Calculate accuracy.
305
+
306
+ Args:
307
+ pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
308
+ pad_targets (LongTensor): Target label tensors (B, Lmax, D).
309
+ ignore_label (int): Ignore label id.
310
+
311
+ Returns:
312
+ float: Accuracy value (0.0 - 1.0).
313
+
314
+ """
315
+ pad_pred = pad_outputs.view(
316
+ pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)
317
+ ).argmax(2)
318
+ mask = pad_targets != ignore_label
319
+ numerator = torch.sum(
320
+ pad_pred.masked_select(mask) == pad_targets.masked_select(mask)
321
+ )
322
+ denominator = torch.sum(mask)
323
+ return float(numerator) / float(denominator)
324
+
325
+
326
+ def to_torch_tensor(x):
327
+ """Change to torch.Tensor or ComplexTensor from numpy.ndarray.
328
+
329
+ Args:
330
+ x: Inputs. It should be one of numpy.ndarray, Tensor, ComplexTensor, and dict.
331
+
332
+ Returns:
333
+ Tensor or ComplexTensor: Type converted inputs.
334
+
335
+ Examples:
336
+ >>> xs = np.ones(3, dtype=np.float32)
337
+ >>> xs = to_torch_tensor(xs)
338
+ tensor([1., 1., 1.])
339
+ >>> xs = torch.ones(3, 4, 5)
340
+ >>> assert to_torch_tensor(xs) is xs
341
+ >>> xs = {'real': xs, 'imag': xs}
342
+ >>> to_torch_tensor(xs)
343
+ ComplexTensor(
344
+ Real:
345
+ tensor([1., 1., 1.])
346
+ Imag;
347
+ tensor([1., 1., 1.])
348
+ )
349
+
350
+ """
351
+ # If numpy, change to torch tensor
352
+ if isinstance(x, np.ndarray):
353
+ if x.dtype.kind == "c":
354
+ # Dynamically importing because torch_complex requires python3
355
+ from torch_complex.tensor import ComplexTensor
356
+
357
+ return ComplexTensor(x)
358
+ else:
359
+ return torch.from_numpy(x)
360
+
361
+ # If {'real': ..., 'imag': ...}, convert to ComplexTensor
362
+ elif isinstance(x, dict):
363
+ # Dynamically importing because torch_complex requires python3
364
+ from torch_complex.tensor import ComplexTensor
365
+
366
+ if "real" not in x or "imag" not in x:
367
+ raise ValueError("has 'real' and 'imag' keys: {}".format(list(x)))
368
+ # Relative importing because of using python3 syntax
369
+ return ComplexTensor(x["real"], x["imag"])
370
+
371
+ # If torch.Tensor, as it is
372
+ elif isinstance(x, torch.Tensor):
373
+ return x
374
+
375
+ else:
376
+ error = (
377
+ "x must be numpy.ndarray, torch.Tensor or a dict like "
378
+ "{{'real': torch.Tensor, 'imag': torch.Tensor}}, "
379
+ "but got {}".format(type(x))
380
+ )
381
+ try:
382
+ from torch_complex.tensor import ComplexTensor
383
+ except Exception:
384
+ # If PY2
385
+ raise ValueError(error)
386
+ else:
387
+ # If PY3
388
+ if isinstance(x, ComplexTensor):
389
+ return x
390
+ else:
391
+ raise ValueError(error)
392
+
393
+
394
+ def get_subsample(train_args, mode, arch):
395
+ """Parse the subsampling factors from the args for the specified `mode` and `arch`.
396
+
397
+ Args:
398
+ train_args: argument Namespace containing options.
399
+ mode: one of ('asr', 'mt', 'st')
400
+ arch: one of ('rnn', 'rnn-t', 'rnn_mix', 'rnn_mulenc', 'transformer')
401
+
402
+ Returns:
403
+ np.ndarray / List[np.ndarray]: subsampling factors.
404
+ """
405
+ if arch == "transformer":
406
+ return np.array([1])
407
+
408
+ elif mode == "mt" and arch == "rnn":
409
+ # +1 means the input (+1) plus the layer outputs (train_args.elayers)
410
+ subsample = np.ones(train_args.elayers + 1, dtype=int)
411
+ logging.warning("Subsampling is not performed for machine translation.")
412
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
413
+ return subsample
414
+
415
+ elif (
416
+ (mode == "asr" and arch in ("rnn", "rnn-t"))
417
+ or (mode == "mt" and arch == "rnn")
418
+ or (mode == "st" and arch == "rnn")
419
+ ):
420
+ subsample = np.ones(train_args.elayers + 1, dtype=int)
421
+ if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
422
+ ss = train_args.subsample.split("_")
423
+ for j in range(min(train_args.elayers + 1, len(ss))):
424
+ subsample[j] = int(ss[j])
425
+ else:
426
+ logging.warning(
427
+ "Subsampling is not performed for vgg*. "
428
+ "It is performed in max pooling layers at CNN."
429
+ )
430
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
431
+ return subsample
432
+
433
+ elif mode == "asr" and arch == "rnn_mix":
434
+ subsample = np.ones(
435
+ train_args.elayers_sd + train_args.elayers + 1, dtype=int
436
+ )
437
+ if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
438
+ ss = train_args.subsample.split("_")
439
+ for j in range(
440
+ min(train_args.elayers_sd + train_args.elayers + 1, len(ss))
441
+ ):
442
+ subsample[j] = int(ss[j])
443
+ else:
444
+ logging.warning(
445
+ "Subsampling is not performed for vgg*. "
446
+ "It is performed in max pooling layers at CNN."
447
+ )
448
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
449
+ return subsample
450
+
451
+ elif mode == "asr" and arch == "rnn_mulenc":
452
+ subsample_list = []
453
+ for idx in range(train_args.num_encs):
454
+ subsample = np.ones(train_args.elayers[idx] + 1, dtype=int)
455
+ if train_args.etype[idx].endswith("p") and not train_args.etype[
456
+ idx
457
+ ].startswith("vgg"):
458
+ ss = train_args.subsample[idx].split("_")
459
+ for j in range(min(train_args.elayers[idx] + 1, len(ss))):
460
+ subsample[j] = int(ss[j])
461
+ else:
462
+ logging.warning(
463
+ "Encoder %d: Subsampling is not performed for vgg*. "
464
+ "It is performed in max pooling layers at CNN.",
465
+ idx + 1,
466
+ )
467
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
468
+ subsample_list.append(subsample)
469
+ return subsample_list
470
+
471
+ else:
472
+ raise ValueError("Invalid options: mode={}, arch={}".format(mode, arch))
473
+
474
+
475
+ def rename_state_dict(
476
+ old_prefix: str, new_prefix: str, state_dict: Dict[str, torch.Tensor]
477
+ ):
478
+ """Replace keys of old prefix with new prefix in state dict."""
479
+ # need this list not to break the dict iterator
480
+ old_keys = [k for k in state_dict if k.startswith(old_prefix)]
481
+ if len(old_keys) > 0:
482
+ logging.warning(f"Rename: {old_prefix} -> {new_prefix}")
483
+ for k in old_keys:
484
+ v = state_dict.pop(k)
485
+ new_k = k.replace(old_prefix, new_prefix)
486
+ state_dict[new_k] = v
487
+
488
+
489
+ def get_activation(act):
490
+ """Return activation function."""
491
+ # Lazy load to avoid unused import
492
+ from espnet.nets.pytorch_backend.conformer.swish import Swish
493
+
494
+ activation_funcs = {
495
+ "hardtanh": torch.nn.Hardtanh,
496
+ "tanh": torch.nn.Tanh,
497
+ "relu": torch.nn.ReLU,
498
+ "selu": torch.nn.SELU,
499
+ "swish": Swish,
500
+ }
501
+
502
+ return activation_funcs[act]()
503
+
504
+
505
+ class MLPHead(torch.nn.Module):
506
+ def __init__(self, idim, hdim, odim, norm="batchnorm"):
507
+ super(MLPHead, self).__init__()
508
+ self.norm = norm
509
+
510
+ self.fc1 = torch.nn.Linear(idim, hdim)
511
+ if norm == "batchnorm":
512
+ self.bn1 = torch.nn.BatchNorm1d(hdim)
513
+ elif norm == "layernorm":
514
+ self.norm1 = torch.nn.LayerNorm(hdim)
515
+ self.nonlin1 = torch.nn.ReLU(inplace=True)
516
+ self.fc2 = torch.nn.Linear(hdim, odim)
517
+
518
+ def forward(self, x):
519
+ x = self.fc1(x)
520
+ if self.norm == "batchnorm":
521
+ x = self.bn1(x.transpose(1, 2)).transpose(1, 2)
522
+ elif self.norm == "layernorm":
523
+ x = self.norm1(x)
524
+ x = self.nonlin1(x)
525
+ x = self.fc2(x)
526
+ return x
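As a quick sanity check of the helpers above, the following sketch (not part of the commit) pads a ragged batch and builds the matching non-padding mask; it uses only functions defined in this file.

import torch
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask, pad_list

xs = [torch.ones(4, 80), torch.ones(2, 80)]              # two feature sequences
lengths = [x.size(0) for x in xs]                        # [4, 2]
padded = pad_list(xs, 0.0)                               # (2, 4, 80)
mask = make_non_pad_mask(lengths, padded, length_dim=1)  # True on real frames
assert padded.shape == (2, 4, 80) and mask.shape == padded.shape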
espnet/nets/pytorch_backend/transformer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Initialize sub package."""
espnet/nets/pytorch_backend/transformer/add_sos_eos.py ADDED
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Unility funcitons for Transformer."""
8
+
9
+ import torch
10
+
11
+
12
+ def add_sos_eos(ys_pad, sos, eos, ignore_id):
13
+ """Add <sos> and <eos> labels.
14
+
15
+ :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
16
+ :param int sos: index of <sos>
17
+ :param int eos: index of <eos>
18
+ :param int ignore_id: index of padding
19
+ :return: padded tensor with <sos> prepended (B, Lmax)
20
+ :rtype: torch.Tensor
21
+ :return: padded tensor with <eos> appended (B, Lmax)
22
+ :rtype: torch.Tensor
23
+ """
24
+ from espnet.nets.pytorch_backend.nets_utils import pad_list
25
+
26
+ _sos = ys_pad.new([sos])
27
+ _eos = ys_pad.new([eos])
28
+ ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys
29
+ ys_in = [torch.cat([_sos, y], dim=0) for y in ys]
30
+ ys_out = [torch.cat([y, _eos], dim=0) for y in ys]
31
+ return pad_list(ys_in, eos), pad_list(ys_out, ignore_id)
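A short sketch (not part of the commit) of the helper above; the ignore index -1 is just an illustrative padding value.

import torch
from espnet.nets.pytorch_backend.transformer.add_sos_eos import add_sos_eos

ys_pad = torch.tensor([[3, 5, 7], [4, 6, -1]])  # -1 marks padding
ys_in, ys_out = add_sos_eos(ys_pad, sos=8, eos=9, ignore_id=-1)
# ys_in:  [[8, 3, 5, 7], [8, 4, 6, 9]]   (padded with eos)
# ys_out: [[3, 5, 7, 9], [4, 6, 9, -1]]  (padded with ignore_id)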
espnet/nets/pytorch_backend/transformer/attention.py ADDED
@@ -0,0 +1,280 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Multi-Head Attention layer definition."""
8
+
9
+ import math
10
+
11
+ import numpy
12
+ import torch
13
+ from torch import nn
14
+
15
+
16
+ class MultiHeadedAttention(nn.Module):
17
+ """Multi-Head Attention layer.
18
+ Args:
19
+ n_head (int): The number of heads.
20
+ n_feat (int): The number of features.
21
+ dropout_rate (float): Dropout rate.
22
+ """
23
+
24
+ def __init__(self, n_head, n_feat, dropout_rate):
25
+ """Construct an MultiHeadedAttention object."""
26
+ super(MultiHeadedAttention, self).__init__()
27
+ assert n_feat % n_head == 0
28
+ # We assume d_v always equals d_k
29
+ self.d_k = n_feat // n_head
30
+ self.h = n_head
31
+ self.linear_q = nn.Linear(n_feat, n_feat)
32
+ self.linear_k = nn.Linear(n_feat, n_feat)
33
+ self.linear_v = nn.Linear(n_feat, n_feat)
34
+ self.linear_out = nn.Linear(n_feat, n_feat)
35
+ self.attn = None
36
+ self.dropout = nn.Dropout(p=dropout_rate)
37
+
38
+ def forward_qkv(self, query, key, value):
39
+ """Transform query, key and value.
40
+ Args:
41
+ query (torch.Tensor): Query tensor (#batch, time1, size).
42
+ key (torch.Tensor): Key tensor (#batch, time2, size).
43
+ value (torch.Tensor): Value tensor (#batch, time2, size).
44
+ Returns:
45
+ torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
46
+ torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
47
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
48
+ """
49
+ n_batch = query.size(0)
50
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
51
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
52
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
53
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
54
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
55
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
56
+
57
+ return q, k, v
58
+
59
+ def forward_attention(self, value, scores, mask, rtn_attn=False):
60
+ """Compute attention context vector.
61
+ Args:
62
+ value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
63
+ scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
64
+ mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
65
+ rtn_attn (boolean): Flag of return attention score
66
+ Returns:
67
+ torch.Tensor: Transformed value (#batch, time1, d_model)
68
+ weighted by the attention score (#batch, time1, time2).
69
+ """
70
+ n_batch = value.size(0)
71
+ if mask is not None:
72
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
73
+ min_value = float(
74
+ numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min
75
+ )
76
+ scores = scores.masked_fill(mask, min_value)
77
+ self.attn = torch.softmax(scores, dim=-1).masked_fill(
78
+ mask, 0.0
79
+ ) # (batch, head, time1, time2)
80
+ else:
81
+ self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
82
+
83
+ p_attn = self.dropout(self.attn)
84
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
85
+ x = (
86
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
87
+ ) # (batch, time1, d_model)
88
+ if rtn_attn:
89
+ return self.linear_out(x), self.attn
90
+ return self.linear_out(x) # (batch, time1, d_model)
91
+
92
+ def forward(self, query, key, value, mask, rtn_attn=False):
93
+ """Compute scaled dot product attention.
94
+ Args:
95
+ query (torch.Tensor): Query tensor (#batch, time1, size).
96
+ key (torch.Tensor): Key tensor (#batch, time2, size).
97
+ value (torch.Tensor): Value tensor (#batch, time2, size).
98
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
99
+ (#batch, time1, time2).
100
+ rtn_attn (boolean): Flag of return attention score
101
+ Returns:
102
+ torch.Tensor: Output tensor (#batch, time1, d_model).
103
+ """
104
+ q, k, v = self.forward_qkv(query, key, value)
105
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
106
+ return self.forward_attention(v, scores, mask, rtn_attn)
107
+
108
+
109
+ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
110
+ """Multi-Head Attention layer with relative position encoding (old version).
111
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
112
+ Paper: https://arxiv.org/abs/1901.02860
113
+ Args:
114
+ n_head (int): The number of heads.
115
+ n_feat (int): The number of features.
116
+ dropout_rate (float): Dropout rate.
117
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
118
+ """
119
+
120
+ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
121
+ """Construct an RelPositionMultiHeadedAttention object."""
122
+ super().__init__(n_head, n_feat, dropout_rate)
123
+ self.zero_triu = zero_triu
124
+ # linear transformation for positional encoding
125
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
126
+ # these two learnable bias are used in matrix c and matrix d
127
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
128
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
129
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
130
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
131
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
132
+
133
+ def rel_shift(self, x):
134
+ """Compute relative positional encoding.
135
+ Args:
136
+ x (torch.Tensor): Input tensor (batch, head, time1, time2).
137
+ Returns:
138
+ torch.Tensor: Output tensor.
139
+ """
140
+ zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
141
+ x_padded = torch.cat([zero_pad, x], dim=-1)
142
+
143
+ x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
144
+ x = x_padded[:, :, 1:].view_as(x)
145
+
146
+ if self.zero_triu:
147
+ ones = torch.ones((x.size(2), x.size(3)))
148
+ x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
149
+
150
+ return x
151
+
152
+ def forward(self, query, key, value, pos_emb, mask):
153
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
154
+ Args:
155
+ query (torch.Tensor): Query tensor (#batch, time1, size).
156
+ key (torch.Tensor): Key tensor (#batch, time2, size).
157
+ value (torch.Tensor): Value tensor (#batch, time2, size).
158
+ pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size).
159
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
160
+ (#batch, time1, time2).
161
+ Returns:
162
+ torch.Tensor: Output tensor (#batch, time1, d_model).
163
+ """
164
+ q, k, v = self.forward_qkv(query, key, value)
165
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
166
+
167
+ n_batch_pos = pos_emb.size(0)
168
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
169
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
170
+
171
+ # (batch, head, time1, d_k)
172
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
173
+ # (batch, head, time1, d_k)
174
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
175
+
176
+ # compute attention score
177
+ # first compute matrix a and matrix c
178
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
179
+ # (batch, head, time1, time2)
180
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
181
+
182
+ # compute matrix b and matrix d
183
+ # (batch, head, time1, time1)
184
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
185
+ matrix_bd = self.rel_shift(matrix_bd)
186
+
187
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
188
+ self.d_k
189
+ ) # (batch, head, time1, time2)
190
+
191
+ return self.forward_attention(v, scores, mask)
192
+
193
+
194
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
195
+ """Multi-Head Attention layer with relative position encoding (new implementation).
196
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
197
+ Paper: https://arxiv.org/abs/1901.02860
198
+ Args:
199
+ n_head (int): The number of heads.
200
+ n_feat (int): The number of features.
201
+ dropout_rate (float): Dropout rate.
202
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
203
+ """
204
+
205
+ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
206
+ """Construct an RelPositionMultiHeadedAttention object."""
207
+ super().__init__(n_head, n_feat, dropout_rate)
208
+ self.zero_triu = zero_triu
209
+ # linear transformation for positional encoding
210
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
211
+ # these two learnable bias are used in matrix c and matrix d
212
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
213
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
214
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
215
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
216
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
217
+
218
+ def rel_shift(self, x):
219
+ """Compute relative positional encoding.
220
+ Args:
221
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
222
+ time1 means the length of query vector.
223
+ Returns:
224
+ torch.Tensor: Output tensor.
225
+ """
226
+ zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
227
+ x_padded = torch.cat([zero_pad, x], dim=-1)
228
+
229
+ x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
230
+ x = x_padded[:, :, 1:].view_as(x)[
231
+ :, :, :, : x.size(-1) // 2 + 1
232
+ ] # only keep the positions from 0 to time2
233
+
234
+ if self.zero_triu:
235
+ ones = torch.ones((x.size(2), x.size(3)), device=x.device)
236
+ x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
237
+
238
+ return x
239
+
240
+ def forward(self, query, key, value, pos_emb, mask):
241
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
242
+ Args:
243
+ query (torch.Tensor): Query tensor (#batch, time1, size).
244
+ key (torch.Tensor): Key tensor (#batch, time2, size).
245
+ value (torch.Tensor): Value tensor (#batch, time2, size).
246
+ pos_emb (torch.Tensor): Positional embedding tensor
247
+ (#batch, 2*time1-1, size).
248
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
249
+ (#batch, time1, time2).
250
+ Returns:
251
+ torch.Tensor: Output tensor (#batch, time1, d_model).
252
+ """
253
+ q, k, v = self.forward_qkv(query, key, value)
254
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
255
+
256
+ n_batch_pos = pos_emb.size(0)
257
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
258
+ p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k)
259
+
260
+ # (batch, head, time1, d_k)
261
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
262
+ # (batch, head, time1, d_k)
263
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
264
+
265
+ # compute attention score
266
+ # first compute matrix a and matrix c
267
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
268
+ # (batch, head, time1, time2)
269
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
270
+
271
+ # compute matrix b and matrix d
272
+ # (batch, head, time1, 2*time1-1)
273
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
274
+ matrix_bd = self.rel_shift(matrix_bd)
275
+
276
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
277
+ self.d_k
278
+ ) # (batch, head, time1, time2)
279
+
280
+ return self.forward_attention(v, scores, mask)
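A minimal sketch (not part of the commit) of plain multi-head self-attention with a padding mask, combining this class with make_non_pad_mask from nets_utils.py above.

import torch
from espnet.nets.pytorch_backend.nets_utils import make_non_pad_mask
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention

att = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.0)
x = torch.randn(2, 10, 256)                      # (batch, time, n_feat)
mask = make_non_pad_mask([10, 7]).unsqueeze(-2)  # (batch, 1, time)
out = att(x, x, x, mask)                         # (2, 10, 256)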
espnet/nets/pytorch_backend/transformer/convolution.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2020 Johns Hopkins University (Shinji Watanabe)
5
+ # Northwestern Polytechnical University (Pengcheng Guo)
6
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
7
+
8
+ """ConvolutionModule definition."""
9
+
10
+ import torch
11
+ from torch import nn
12
+
13
+
14
+ class ConvolutionModule(nn.Module):
15
+ """ConvolutionModule in Conformer model.
16
+
17
+ :param int channels: channels of cnn
18
+ :param int kernel_size: kernel size of the CNN
19
+
20
+ """
21
+
22
+ def __init__(self, channels, kernel_size, bias=True):
23
+ """Construct an ConvolutionModule object."""
24
+ super(ConvolutionModule, self).__init__()
25
+ # kernel_size should be an odd number for 'SAME' padding
26
+ assert (kernel_size - 1) % 2 == 0
27
+
28
+ self.pointwise_cov1 = nn.Conv1d(
29
+ channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias,
30
+ )
31
+ self.depthwise_conv = nn.Conv1d(
32
+ channels,
33
+ channels,
34
+ kernel_size,
35
+ stride=1,
36
+ padding=(kernel_size - 1) // 2,
37
+ groups=channels,
38
+ bias=bias,
39
+ )
40
+ self.norm = nn.BatchNorm1d(channels)
41
+ self.pointwise_cov2 = nn.Conv1d(
42
+ channels, channels, kernel_size=1, stride=1, padding=0, bias=bias,
43
+ )
44
+ self.activation = Swish()
45
+
46
+ def forward(self, x):
47
+ """Compute covolution module.
48
+
49
+ :param torch.Tensor x: (batch, time, size)
50
+ :return torch.Tensor: convolved `value` (batch, time, d_model)
51
+ """
52
+ # exchange the temporal dimension and the feature dimension
53
+ x = x.transpose(1, 2)
54
+
55
+ # GLU mechanism
56
+ x = self.pointwise_cov1(x) # (batch, 2*channel, time)
57
+ x = nn.functional.glu(x, dim=1) # (batch, channel, time)
58
+
59
+ # 1D Depthwise Conv
60
+ x = self.depthwise_conv(x)
61
+ x = self.activation(self.norm(x))
62
+
63
+ x = self.pointwise_cov2(x)
64
+
65
+ return x.transpose(1, 2)
66
+
67
+
68
+ class Swish(nn.Module):
69
+ """Construct an Swish object."""
70
+
71
+ def forward(self, x):
72
+ """Return Swich activation function."""
73
+ return x * torch.sigmoid(x)
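A minimal sketch (not part of the commit) of the Conformer convolution module above; the kernel size must be odd, as asserted in the constructor.

import torch
from espnet.nets.pytorch_backend.transformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=256, kernel_size=31)
x = torch.randn(2, 50, 256)  # (batch, time, channels)
y = conv(x)                  # same shape: (2, 50, 256)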
espnet/nets/pytorch_backend/transformer/decoder.py ADDED
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Decoder definition."""
8
+
9
+ from typing import Any
10
+ from typing import List
11
+ from typing import Tuple
12
+
13
+ import torch
14
+
15
+ from espnet.nets.pytorch_backend.nets_utils import rename_state_dict
16
+ from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
17
+ from espnet.nets.pytorch_backend.transformer.decoder_layer import DecoderLayer
18
+ from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
19
+ from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
20
+ from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
21
+ from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
22
+ PositionwiseFeedForward, # noqa: H301
23
+ )
24
+ from espnet.nets.pytorch_backend.transformer.repeat import repeat
25
+ from espnet.nets.scorer_interface import BatchScorerInterface
26
+
27
+
28
+ def _pre_hook(
29
+ state_dict,
30
+ prefix,
31
+ local_metadata,
32
+ strict,
33
+ missing_keys,
34
+ unexpected_keys,
35
+ error_msgs,
36
+ ):
37
+ # https://github.com/espnet/espnet/commit/3d422f6de8d4f03673b89e1caef698745ec749ea#diff-bffb1396f038b317b2b64dd96e6d3563
38
+ rename_state_dict(prefix + "output_norm.", prefix + "after_norm.", state_dict)
39
+
40
+
41
+ class Decoder(BatchScorerInterface, torch.nn.Module):
42
+ """Transfomer decoder module.
43
+
44
+ :param int odim: output dim
45
+ :param int attention_dim: dimension of attention
46
+ :param int attention_heads: the number of heads of multi head attention
47
+ :param int linear_units: the number of units of position-wise feed forward
48
+ :param int num_blocks: the number of decoder blocks
49
+ :param float dropout_rate: dropout rate
50
+ :param float attention_dropout_rate: dropout rate for attention
51
+ :param str or torch.nn.Module input_layer: input layer type
52
+ :param bool use_output_layer: whether to use output layer
53
+ :param class pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
54
+ :param bool normalize_before: whether to use layer_norm before the first block
55
+ :param bool concat_after: whether to concat attention layer's input and output
56
+ if True, additional linear will be applied.
57
+ i.e. x -> x + linear(concat(x, att(x)))
58
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ odim,
64
+ attention_dim=256,
65
+ attention_heads=4,
66
+ linear_units=2048,
67
+ num_blocks=6,
68
+ dropout_rate=0.1,
69
+ positional_dropout_rate=0.1,
70
+ self_attention_dropout_rate=0.0,
71
+ src_attention_dropout_rate=0.0,
72
+ input_layer="embed",
73
+ use_output_layer=True,
74
+ pos_enc_class=PositionalEncoding,
75
+ normalize_before=True,
76
+ concat_after=False,
77
+ ):
78
+ """Construct an Decoder object."""
79
+ torch.nn.Module.__init__(self)
80
+ self._register_load_state_dict_pre_hook(_pre_hook)
81
+ if input_layer == "embed":
82
+ self.embed = torch.nn.Sequential(
83
+ torch.nn.Embedding(odim, attention_dim),
84
+ pos_enc_class(attention_dim, positional_dropout_rate),
85
+ )
86
+ elif input_layer == "linear":
87
+ self.embed = torch.nn.Sequential(
88
+ torch.nn.Linear(odim, attention_dim),
89
+ torch.nn.LayerNorm(attention_dim),
90
+ torch.nn.Dropout(dropout_rate),
91
+ torch.nn.ReLU(),
92
+ pos_enc_class(attention_dim, positional_dropout_rate),
93
+ )
94
+ elif isinstance(input_layer, torch.nn.Module):
95
+ self.embed = torch.nn.Sequential(
96
+ input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
97
+ )
98
+ else:
99
+ raise NotImplementedError("only `embed`, `linear`, or torch.nn.Module is supported.")
100
+ self.normalize_before = normalize_before
101
+ self.decoders = repeat(
102
+ num_blocks,
103
+ lambda: DecoderLayer(
104
+ attention_dim,
105
+ MultiHeadedAttention(
106
+ attention_heads, attention_dim, self_attention_dropout_rate
107
+ ),
108
+ MultiHeadedAttention(
109
+ attention_heads, attention_dim, src_attention_dropout_rate
110
+ ),
111
+ PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
112
+ dropout_rate,
113
+ normalize_before,
114
+ concat_after,
115
+ ),
116
+ )
117
+ if self.normalize_before:
118
+ self.after_norm = LayerNorm(attention_dim)
119
+ if use_output_layer:
120
+ self.output_layer = torch.nn.Linear(attention_dim, odim)
121
+ else:
122
+ self.output_layer = None
123
+
124
+ def forward(self, tgt, tgt_mask, memory, memory_mask):
125
+ """Forward decoder.
126
+ :param torch.Tensor tgt: input token ids, int64 (batch, maxlen_out)
127
+ if input_layer == "embed"
128
+ input tensor (batch, maxlen_out, #mels)
129
+ in the other cases
130
+ :param torch.Tensor tgt_mask: input token mask, (batch, maxlen_out)
131
+ dtype=torch.uint8 in PyTorch 1.2-
132
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
133
+ :param torch.Tensor memory: encoded memory, float32 (batch, maxlen_in, feat)
134
+ :param torch.Tensor memory_mask: encoded memory mask, (batch, maxlen_in)
135
+ dtype=torch.uint8 in PyTorch 1.2-
136
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
137
+ :return x: decoded token score before softmax (batch, maxlen_out, token)
138
+ if use_output_layer is True,
139
+ final block outputs (batch, maxlen_out, attention_dim)
140
+ in the other cases
141
+ :rtype: torch.Tensor
142
+ :return tgt_mask: score mask before softmax (batch, maxlen_out)
143
+ :rtype: torch.Tensor
144
+ """
145
+ x = self.embed(tgt)
146
+ x, tgt_mask, memory, memory_mask = self.decoders(
147
+ x, tgt_mask, memory, memory_mask
148
+ )
149
+ if self.normalize_before:
150
+ x = self.after_norm(x)
151
+ if self.output_layer is not None:
152
+ x = self.output_layer(x)
153
+ return x, tgt_mask
154
+
155
+ def forward_one_step(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
156
+ """Forward one step.
157
+ :param torch.Tensor tgt: input token ids, int64 (batch, maxlen_out)
158
+ :param torch.Tensor tgt_mask: input token mask, (batch, maxlen_out)
159
+ dtype=torch.uint8 in PyTorch 1.2-
160
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
161
+ :param torch.Tensor memory: encoded memory, float32 (batch, maxlen_in, feat)
162
+ :param List[torch.Tensor] cache:
163
+ cached output list of (batch, max_time_out-1, size)
164
+ :return y, cache: NN output value and cache per `self.decoders`.
165
+ `y.shape` is (batch, maxlen_out, token)
166
+ :rtype: Tuple[torch.Tensor, List[torch.Tensor]]
167
+ """
168
+ x = self.embed(tgt)
169
+ if cache is None:
170
+ cache = [None] * len(self.decoders)
171
+ new_cache = []
172
+ for c, decoder in zip(cache, self.decoders):
173
+ x, tgt_mask, memory, memory_mask = decoder(
174
+ x, tgt_mask, memory, memory_mask, cache=c
175
+ )
176
+ new_cache.append(x)
177
+
178
+ if self.normalize_before:
179
+ y = self.after_norm(x[:, -1])
180
+ else:
181
+ y = x[:, -1]
182
+ if self.output_layer is not None:
183
+ y = torch.log_softmax(self.output_layer(y), dim=-1)
184
+
185
+ return y, new_cache
186
+
187
+ # beam search API (see ScorerInterface)
188
+ def score(self, ys, state, x):
189
+ """Score."""
190
+ ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0)
191
+ logp, state = self.forward_one_step(
192
+ ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state
193
+ )
194
+ return logp.squeeze(0), state
195
+
196
+ # batch beam search API (see BatchScorerInterface)
197
+ def batch_score(
198
+ self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
199
+ ) -> Tuple[torch.Tensor, List[Any]]:
200
+ """Score new token batch (required).
201
+ Args:
202
+ ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
203
+ states (List[Any]): Scorer states for prefix tokens.
204
+ xs (torch.Tensor):
205
+ The encoder feature that generates ys (n_batch, xlen, n_feat).
206
+ Returns:
207
+ tuple[torch.Tensor, List[Any]]: Tuple of
208
+ batchfied scores for next token with shape of `(n_batch, n_vocab)`
209
+ and next state list for ys.
210
+ """
211
+ # merge states
212
+ n_batch = len(ys)
213
+ n_layers = len(self.decoders)
214
+ if states[0] is None:
215
+ batch_state = None
216
+ else:
217
+ # transpose state of [batch, layer] into [layer, batch]
218
+ batch_state = [
219
+ torch.stack([states[b][l] for b in range(n_batch)])
220
+ for l in range(n_layers)
221
+ ]
222
+
223
+ # batch decoding
224
+ ys_mask = subsequent_mask(ys.size(-1), device=xs.device).unsqueeze(0)
225
+ logp, states = self.forward_one_step(ys, ys_mask, xs, cache=batch_state)
226
+
227
+ # transpose state of [layer, batch] into [batch, layer]
228
+ state_list = [[states[l][b] for l in range(n_layers)] for b in range(n_batch)]
229
+ return logp, state_list
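A minimal sketch (not part of the commit) of a single decoder forward pass over dummy encoder memory, using subsequent_mask as the causal target mask; shapes follow the docstrings above.

import torch
from espnet.nets.pytorch_backend.transformer.decoder import Decoder
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask

dec = Decoder(odim=500, attention_dim=256, attention_heads=4, num_blocks=2)
memory = torch.randn(2, 30, 256)                     # encoder output
memory_mask = torch.ones(2, 1, 30, dtype=torch.bool)
tgt = torch.randint(0, 500, (2, 8))                  # target token ids
tgt_mask = subsequent_mask(8, device=tgt.device).unsqueeze(0)  # (1, 8, 8)
scores, _ = dec(tgt, tgt_mask, memory, memory_mask)  # (2, 8, 500)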
espnet/nets/pytorch_backend/transformer/decoder_layer.py ADDED
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Decoder self-attention layer definition."""
8
+
9
+ import torch
10
+ from torch import nn
11
+
12
+ from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
13
+
14
+
15
+ class DecoderLayer(nn.Module):
16
+ """Single decoder layer module.
17
+ :param int size: input dim
18
+ :param espnet.nets.pytorch_backend.transformer.attention.MultiHeadedAttention
19
+ self_attn: self attention module
20
+ :param espnet.nets.pytorch_backend.transformer.attention.MultiHeadedAttention
21
+ src_attn: source attention module
22
+ :param espnet.nets.pytorch_backend.transformer.positionwise_feed_forward.
23
+ PositionwiseFeedForward feed_forward: feed forward layer module
24
+ :param float dropout_rate: dropout rate
25
+ :param bool normalize_before: whether to use layer_norm before the first block
26
+ :param bool concat_after: whether to concat attention layer's input and output
27
+ if True, additional linear will be applied.
28
+ i.e. x -> x + linear(concat(x, att(x)))
29
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ size,
35
+ self_attn,
36
+ src_attn,
37
+ feed_forward,
38
+ dropout_rate,
39
+ normalize_before=True,
40
+ concat_after=False,
41
+ ):
42
+ """Construct an DecoderLayer object."""
43
+ super(DecoderLayer, self).__init__()
44
+ self.size = size
45
+ self.self_attn = self_attn
46
+ self.src_attn = src_attn
47
+ self.feed_forward = feed_forward
48
+ self.norm1 = LayerNorm(size)
49
+ self.norm2 = LayerNorm(size)
50
+ self.norm3 = LayerNorm(size)
51
+ self.dropout = nn.Dropout(dropout_rate)
52
+ self.normalize_before = normalize_before
53
+ self.concat_after = concat_after
54
+ if self.concat_after:
55
+ self.concat_linear1 = nn.Linear(size + size, size)
56
+ self.concat_linear2 = nn.Linear(size + size, size)
57
+
58
+ def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
59
+ """Compute decoded features.
60
+ Args:
61
+ tgt (torch.Tensor):
62
+ decoded previous target features (batch, max_time_out, size)
63
+ tgt_mask (torch.Tensor): mask for x (batch, max_time_out)
64
+ memory (torch.Tensor): encoded source features (batch, max_time_in, size)
65
+ memory_mask (torch.Tensor): mask for memory (batch, max_time_in)
66
+ cache (torch.Tensor): cached output (batch, max_time_out-1, size)
67
+ """
68
+ residual = tgt
69
+ if self.normalize_before:
70
+ tgt = self.norm1(tgt)
71
+
72
+ if cache is None:
73
+ tgt_q = tgt
74
+ tgt_q_mask = tgt_mask
75
+ else:
76
+ # compute only the last frame query keeping dim: max_time_out -> 1
77
+ assert cache.shape == (
78
+ tgt.shape[0],
79
+ tgt.shape[1] - 1,
80
+ self.size,
81
+ ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
82
+ tgt_q = tgt[:, -1:, :]
83
+ residual = residual[:, -1:, :]
84
+ tgt_q_mask = None
85
+ if tgt_mask is not None:
86
+ tgt_q_mask = tgt_mask[:, -1:, :]
87
+
88
+ if self.concat_after:
89
+ tgt_concat = torch.cat(
90
+ (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1
91
+ )
92
+ x = residual + self.concat_linear1(tgt_concat)
93
+ else:
94
+ x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask))
95
+ if not self.normalize_before:
96
+ x = self.norm1(x)
97
+
98
+ residual = x
99
+ if self.normalize_before:
100
+ x = self.norm2(x)
101
+ if self.concat_after:
102
+ x_concat = torch.cat(
103
+ (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1
104
+ )
105
+ x = residual + self.concat_linear2(x_concat)
106
+ else:
107
+ x = residual + self.dropout(self.src_attn(x, memory, memory, memory_mask))
108
+ if not self.normalize_before:
109
+ x = self.norm2(x)
110
+
111
+ residual = x
112
+ if self.normalize_before:
113
+ x = self.norm3(x)
114
+ x = residual + self.dropout(self.feed_forward(x))
115
+ if not self.normalize_before:
116
+ x = self.norm3(x)
117
+
118
+ if cache is not None:
119
+ x = torch.cat([cache, x], dim=1)
120
+
121
+ return x, tgt_mask, memory, memory_mask
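A minimal sketch (not part of the commit) of the cache path in the layer above: when `cache` holds the (batch, time-1, size) outputs of earlier steps, only the newest position is recomputed and the concatenated sequence is returned.

import torch
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.decoder_layer import DecoderLayer
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,
)

layer = DecoderLayer(
    256,
    MultiHeadedAttention(4, 256, 0.0),
    MultiHeadedAttention(4, 256, 0.0),
    PositionwiseFeedForward(256, 2048, 0.1),
    dropout_rate=0.1,
)
memory = torch.randn(1, 30, 256)   # dummy encoder memory
tgt = torch.randn(1, 5, 256)       # 5 decoded positions so far
cache = torch.randn(1, 4, 256)     # placeholder outputs of the first 4 positions
x, *_ = layer(tgt, None, memory, None, cache=cache)
assert x.shape == (1, 5, 256)      # cache plus the newly computed last step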
espnet/nets/pytorch_backend/transformer/embedding.py ADDED
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Positional Encoding Module."""
8
+
9
+ import math
10
+
11
+ import torch
12
+
13
+
14
+ def _pre_hook(
15
+ state_dict,
16
+ prefix,
17
+ local_metadata,
18
+ strict,
19
+ missing_keys,
20
+ unexpected_keys,
21
+ error_msgs,
22
+ ):
23
+ """Perform pre-hook in load_state_dict for backward compatibility.
24
+ Note:
25
+ We saved self.pe until v.0.5.2 but we have omitted it later.
26
+ Therefore, we remove the item "pe" from `state_dict` for backward compatibility.
27
+ """
28
+ k = prefix + "pe"
29
+ if k in state_dict:
30
+ state_dict.pop(k)
31
+
32
+
33
+ class PositionalEncoding(torch.nn.Module):
34
+ """Positional encoding.
35
+ Args:
36
+ d_model (int): Embedding dimension.
37
+ dropout_rate (float): Dropout rate.
38
+ max_len (int): Maximum input length.
39
+ reverse (bool): Whether to reverse the input position. Only for
40
+ the class LegacyRelPositionalEncoding. We remove it in the current
41
+ class RelPositionalEncoding.
42
+ """
43
+
44
+ def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
45
+ """Construct an PositionalEncoding object."""
46
+ super(PositionalEncoding, self).__init__()
47
+ self.d_model = d_model
48
+ self.reverse = reverse
49
+ self.xscale = math.sqrt(self.d_model)
50
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
51
+ self.pe = None
52
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
53
+ self._register_load_state_dict_pre_hook(_pre_hook)
54
+
55
+ def extend_pe(self, x):
56
+ """Reset the positional encodings."""
57
+ if self.pe is not None:
58
+ if self.pe.size(1) >= x.size(1):
59
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
60
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
61
+ return
62
+ pe = torch.zeros(x.size(1), self.d_model)
63
+ if self.reverse:
64
+ position = torch.arange(
65
+ x.size(1) - 1, -1, -1.0, dtype=torch.float32
66
+ ).unsqueeze(1)
67
+ else:
68
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
69
+ div_term = torch.exp(
70
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
71
+ * -(math.log(10000.0) / self.d_model)
72
+ )
73
+ pe[:, 0::2] = torch.sin(position * div_term)
74
+ pe[:, 1::2] = torch.cos(position * div_term)
75
+ pe = pe.unsqueeze(0)
76
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
77
+
78
+ def forward(self, x: torch.Tensor):
79
+ """Add positional encoding.
80
+ Args:
81
+ x (torch.Tensor): Input tensor (batch, time, `*`).
82
+ Returns:
83
+ torch.Tensor: Encoded tensor (batch, time, `*`).
84
+ """
85
+ self.extend_pe(x)
86
+ x = x * self.xscale + self.pe[:, : x.size(1)]
87
+ return self.dropout(x)
88
+
89
+
90
+ class ScaledPositionalEncoding(PositionalEncoding):
91
+ """Scaled positional encoding module.
92
+ See Sec. 3.2 https://arxiv.org/abs/1809.08895
93
+ Args:
94
+ d_model (int): Embedding dimension.
95
+ dropout_rate (float): Dropout rate.
96
+ max_len (int): Maximum input length.
97
+ """
98
+
99
+ def __init__(self, d_model, dropout_rate, max_len=5000):
100
+ """Initialize class."""
101
+ super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
102
+ self.alpha = torch.nn.Parameter(torch.tensor(1.0))
103
+
104
+ def reset_parameters(self):
105
+ """Reset parameters."""
106
+ self.alpha.data = torch.tensor(1.0)
107
+
108
+ def forward(self, x):
109
+ """Add positional encoding.
110
+ Args:
111
+ x (torch.Tensor): Input tensor (batch, time, `*`).
112
+ Returns:
113
+ torch.Tensor: Encoded tensor (batch, time, `*`).
114
+ """
115
+ self.extend_pe(x)
116
+ x = x + self.alpha * self.pe[:, : x.size(1)]
117
+ return self.dropout(x)
118
+
119
+
120
+ class LegacyRelPositionalEncoding(PositionalEncoding):
121
+ """Relative positional encoding module (old version).
122
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
123
+ See : Appendix B in https://arxiv.org/abs/1901.02860
124
+ Args:
125
+ d_model (int): Embedding dimension.
126
+ dropout_rate (float): Dropout rate.
127
+ max_len (int): Maximum input length.
128
+ """
129
+
130
+ def __init__(self, d_model, dropout_rate, max_len=5000):
131
+ """Initialize class."""
132
+ super().__init__(
133
+ d_model=d_model,
134
+ dropout_rate=dropout_rate,
135
+ max_len=max_len,
136
+ reverse=True,
137
+ )
138
+
139
+ def forward(self, x):
140
+ """Compute positional encoding.
141
+ Args:
142
+ x (torch.Tensor): Input tensor (batch, time, `*`).
143
+ Returns:
144
+ torch.Tensor: Encoded tensor (batch, time, `*`).
145
+ torch.Tensor: Positional embedding tensor (1, time, `*`).
146
+ """
147
+ self.extend_pe(x)
148
+ x = x * self.xscale
149
+ pos_emb = self.pe[:, : x.size(1)]
150
+ return self.dropout(x), self.dropout(pos_emb)
151
+
152
+
153
+ class RelPositionalEncoding(torch.nn.Module):
154
+ """Relative positional encoding module (new implementation).
155
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
156
+ See : Appendix B in https://arxiv.org/abs/1901.02860
157
+ Args:
158
+ d_model (int): Embedding dimension.
159
+ dropout_rate (float): Dropout rate.
160
+ max_len (int): Maximum input length.
161
+ """
162
+
163
+ def __init__(self, d_model, dropout_rate, max_len=5000):
164
+ """Construct an PositionalEncoding object."""
165
+ super(RelPositionalEncoding, self).__init__()
166
+ self.d_model = d_model
167
+ self.xscale = math.sqrt(self.d_model)
168
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
169
+ self.pe = None
170
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
171
+
172
+ def extend_pe(self, x):
173
+ """Reset the positional encodings."""
174
+ if self.pe is not None:
175
+ # self.pe contains both positive and negative parts
176
+ # the length of self.pe is 2 * input_len - 1
177
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
178
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
179
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
180
+ return
181
+ # Suppose `i` denotes the position of the query vector and `j` the
182
+ # position of the key vector. We use positive relative positions when keys
183
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
184
+ pe_positive = torch.zeros(x.size(1), self.d_model)
185
+ pe_negative = torch.zeros(x.size(1), self.d_model)
186
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
187
+ div_term = torch.exp(
188
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
189
+ * -(math.log(10000.0) / self.d_model)
190
+ )
191
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
192
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
193
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
194
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
195
+
196
+ # Reverse the order of positive indices and concat both positive and
197
+ # negative indices. This is used to support the shifting trick
198
+ # as in https://arxiv.org/abs/1901.02860
199
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
200
+ pe_negative = pe_negative[1:].unsqueeze(0)
201
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
202
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
203
+
204
+ def forward(self, x: torch.Tensor):
205
+ """Add positional encoding.
206
+ Args:
207
+ x (torch.Tensor): Input tensor (batch, time, `*`).
208
+ Returns:
209
+ torch.Tensor: Encoded tensor (batch, time, `*`).
210
+ """
211
+ self.extend_pe(x)
212
+ x = x * self.xscale
213
+ pos_emb = self.pe[
214
+ :,
215
+ self.pe.size(1) // 2 - x.size(1) + 1 : self.pe.size(1) // 2 + x.size(1),
216
+ ]
217
+ return self.dropout(x), self.dropout(pos_emb)
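A minimal usage sketch of RelPositionalEncoding (assuming the espnet package in this repo is importable); the module returns the scaled input together with a relative position embedding covering both positive and negative offsets, i.e. 2*T - 1 positions for a length-T input, which RelPositionMultiHeadedAttention consumes:

import torch
from espnet.nets.pytorch_backend.transformer.embedding import RelPositionalEncoding

pos_enc = RelPositionalEncoding(d_model=256, dropout_rate=0.0)
x = torch.randn(2, 50, 256)        # (batch, time, d_model)
x_scaled, pos_emb = pos_enc(x)     # x is scaled by sqrt(d_model)
print(x_scaled.shape)              # torch.Size([2, 50, 256])
print(pos_emb.shape)               # torch.Size([1, 99, 256]): 2 * 50 - 1 relative positions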
espnet/nets/pytorch_backend/transformer/encoder.py ADDED
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Encoder definition."""
8
+
9
+ import torch
10
+
11
+ from espnet.nets.pytorch_backend.nets_utils import rename_state_dict
12
+ #from espnet.nets.pytorch_backend.transducer.vgg import VGG2L
13
+ from espnet.nets.pytorch_backend.transformer.attention import (
14
+ MultiHeadedAttention, # noqa: H301
15
+ RelPositionMultiHeadedAttention, # noqa: H301
16
+ LegacyRelPositionMultiHeadedAttention, # noqa: H301
17
+ )
18
+ from espnet.nets.pytorch_backend.transformer.convolution import ConvolutionModule
19
+ from espnet.nets.pytorch_backend.transformer.embedding import (
20
+ PositionalEncoding, # noqa: H301
21
+ RelPositionalEncoding, # noqa: H301
22
+ LegacyRelPositionalEncoding, # noqa: H301
23
+ )
24
+ from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
25
+ from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
26
+ from espnet.nets.pytorch_backend.transformer.multi_layer_conv import Conv1dLinear
27
+ from espnet.nets.pytorch_backend.transformer.multi_layer_conv import MultiLayeredConv1d
28
+ from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
29
+ PositionwiseFeedForward, # noqa: H301
30
+ )
31
+ from espnet.nets.pytorch_backend.transformer.repeat import repeat
32
+ from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling
33
+ from espnet.nets.pytorch_backend.transformer.raw_embeddings import VideoEmbedding
34
+ from espnet.nets.pytorch_backend.transformer.raw_embeddings import AudioEmbedding
35
+ from espnet.nets.pytorch_backend.backbones.conv3d_extractor import Conv3dResNet
36
+ from espnet.nets.pytorch_backend.backbones.conv1d_extractor import Conv1dResNet
37
+
38
+
39
+ def _pre_hook(
40
+ state_dict,
41
+ prefix,
42
+ local_metadata,
43
+ strict,
44
+ missing_keys,
45
+ unexpected_keys,
46
+ error_msgs,
47
+ ):
48
+ # https://github.com/espnet/espnet/commit/21d70286c354c66c0350e65dc098d2ee236faccc#diff-bffb1396f038b317b2b64dd96e6d3563
49
+ rename_state_dict(prefix + "input_layer.", prefix + "embed.", state_dict)
50
+ # https://github.com/espnet/espnet/commit/3d422f6de8d4f03673b89e1caef698745ec749ea#diff-bffb1396f038b317b2b64dd96e6d3563
51
+ rename_state_dict(prefix + "norm.", prefix + "after_norm.", state_dict)
52
+
53
+
54
+ class Encoder(torch.nn.Module):
55
+ """Transformer encoder module.
56
+
57
+ :param int idim: input dim
58
+ :param int attention_dim: dimension of attention
59
+ :param int attention_heads: the number of heads of multi head attention
60
+ :param int linear_units: the number of units of position-wise feed forward
61
+ :param int num_blocks: the number of decoder blocks
62
+ :param float dropout_rate: dropout rate
63
+ :param float attention_dropout_rate: dropout rate in attention
64
+ :param float positional_dropout_rate: dropout rate after adding positional encoding
65
+ :param str or torch.nn.Module input_layer: input layer type
66
+ :param class pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
67
+ :param bool normalize_before: whether to use layer_norm before the first block
68
+ :param bool concat_after: whether to concat attention layer's input and output
69
+ if True, additional linear will be applied.
70
+ i.e. x -> x + linear(concat(x, att(x)))
71
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
72
+ :param str positionwise_layer_type: linear or conv1d
73
+ :param int positionwise_conv_kernel_size: kernel size of positionwise conv1d layer
74
+ :param str encoder_attn_layer_type: encoder attention layer type
75
+ :param bool macaron_style: whether to use macaron style for positionwise layer
76
+ :param bool use_cnn_module: whether to use convolution module
77
+ :param bool zero_triu: whether to zero the upper triangular part of attention matrix
78
+ :param int cnn_module_kernel: kernel size of convolution module
79
+ :param int padding_idx: padding_idx for input_layer=embed
80
+ """
81
+
82
+ def __init__(
83
+ self,
84
+ idim,
85
+ attention_dim=256,
86
+ attention_heads=4,
87
+ linear_units=2048,
88
+ num_blocks=6,
89
+ dropout_rate=0.1,
90
+ positional_dropout_rate=0.1,
91
+ attention_dropout_rate=0.0,
92
+ input_layer="conv2d",
93
+ pos_enc_class=PositionalEncoding,
94
+ normalize_before=True,
95
+ concat_after=False,
96
+ positionwise_layer_type="linear",
97
+ positionwise_conv_kernel_size=1,
98
+ macaron_style=False,
99
+ encoder_attn_layer_type="mha",
100
+ use_cnn_module=False,
101
+ zero_triu=False,
102
+ cnn_module_kernel=31,
103
+ padding_idx=-1,
104
+ relu_type="prelu",
105
+ a_upsample_ratio=1,
106
+ ):
107
+ """Construct an Encoder object."""
108
+ super(Encoder, self).__init__()
109
+ self._register_load_state_dict_pre_hook(_pre_hook)
110
+
111
+ if encoder_attn_layer_type == "rel_mha":
112
+ pos_enc_class = RelPositionalEncoding
113
+ elif encoder_attn_layer_type == "legacy_rel_mha":
114
+ pos_enc_class = LegacyRelPositionalEncoding
115
+ # -- frontend module.
116
+ if input_layer == "conv1d":
117
+ self.frontend = Conv1dResNet(
118
+ relu_type=relu_type,
119
+ a_upsample_ratio=a_upsample_ratio,
120
+ )
121
+ elif input_layer == "conv3d":
122
+ self.frontend = Conv3dResNet(relu_type=relu_type)
123
+ else:
124
+ self.frontend = None
125
+ # -- backend module.
126
+ if input_layer == "linear":
127
+ self.embed = torch.nn.Sequential(
128
+ torch.nn.Linear(idim, attention_dim),
129
+ torch.nn.LayerNorm(attention_dim),
130
+ torch.nn.Dropout(dropout_rate),
131
+ torch.nn.ReLU(),
132
+ pos_enc_class(attention_dim, positional_dropout_rate),
133
+ )
134
+ elif input_layer == "conv2d":
135
+ self.embed = Conv2dSubsampling(
136
+ idim,
137
+ attention_dim,
138
+ dropout_rate,
139
+ pos_enc_class(attention_dim, dropout_rate),
140
+ )
141
+ elif input_layer == "vgg2l":
142
+ self.embed = VGG2L(idim, attention_dim)  # NOTE: the VGG2L import is commented out above; re-enable it before using this branch
143
+ elif input_layer == "embed":
144
+ self.embed = torch.nn.Sequential(
145
+ torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
146
+ pos_enc_class(attention_dim, positional_dropout_rate),
147
+ )
148
+ elif isinstance(input_layer, torch.nn.Module):
149
+ self.embed = torch.nn.Sequential(
150
+ input_layer, pos_enc_class(attention_dim, positional_dropout_rate),
151
+ )
152
+ elif input_layer in ["conv1d", "conv3d"]:
153
+ self.embed = torch.nn.Sequential(
154
+ torch.nn.Linear(512, attention_dim),
155
+ pos_enc_class(attention_dim, positional_dropout_rate)
156
+ )
157
+ elif input_layer is None:
158
+ self.embed = torch.nn.Sequential(
159
+ pos_enc_class(attention_dim, positional_dropout_rate)
160
+ )
161
+ else:
162
+ raise ValueError("unknown input_layer: " + input_layer)
163
+ self.normalize_before = normalize_before
164
+ if positionwise_layer_type == "linear":
165
+ positionwise_layer = PositionwiseFeedForward
166
+ positionwise_layer_args = (attention_dim, linear_units, dropout_rate)
167
+ elif positionwise_layer_type == "conv1d":
168
+ positionwise_layer = MultiLayeredConv1d
169
+ positionwise_layer_args = (
170
+ attention_dim,
171
+ linear_units,
172
+ positionwise_conv_kernel_size,
173
+ dropout_rate,
174
+ )
175
+ elif positionwise_layer_type == "conv1d-linear":
176
+ positionwise_layer = Conv1dLinear
177
+ positionwise_layer_args = (
178
+ attention_dim,
179
+ linear_units,
180
+ positionwise_conv_kernel_size,
181
+ dropout_rate,
182
+ )
183
+ else:
184
+ raise NotImplementedError("Support only linear or conv1d.")
185
+
186
+ if encoder_attn_layer_type == "mha":
187
+ encoder_attn_layer = MultiHeadedAttention
188
+ encoder_attn_layer_args = (
189
+ attention_heads,
190
+ attention_dim,
191
+ attention_dropout_rate,
192
+ )
193
+ elif encoder_attn_layer_type == "legacy_rel_mha":
194
+ encoder_attn_layer = LegacyRelPositionMultiHeadedAttention
195
+ encoder_attn_layer_args = (
196
+ attention_heads,
197
+ attention_dim,
198
+ attention_dropout_rate,
199
+ )
200
+ elif encoder_attn_layer_type == "rel_mha":
201
+ encoder_attn_layer = RelPositionMultiHeadedAttention
202
+ encoder_attn_layer_args = (
203
+ attention_heads,
204
+ attention_dim,
205
+ attention_dropout_rate,
206
+ zero_triu,
207
+ )
208
+ else:
209
+ raise ValueError("unknown encoder_attn_layer_type: " + encoder_attn_layer_type)
210
+
211
+ convolution_layer = ConvolutionModule
212
+ convolution_layer_args = (attention_dim, cnn_module_kernel)
213
+
214
+ self.encoders = repeat(
215
+ num_blocks,
216
+ lambda: EncoderLayer(
217
+ attention_dim,
218
+ encoder_attn_layer(*encoder_attn_layer_args),
219
+ positionwise_layer(*positionwise_layer_args),
220
+ convolution_layer(*convolution_layer_args) if use_cnn_module else None,
221
+ dropout_rate,
222
+ normalize_before,
223
+ concat_after,
224
+ macaron_style,
225
+ ),
226
+ )
227
+ if self.normalize_before:
228
+ self.after_norm = LayerNorm(attention_dim)
229
+
230
+ def forward(self, xs, masks, extract_resnet_feats=False):
231
+ """Encode input sequence.
232
+
233
+ :param torch.Tensor xs: input tensor
234
+ :param torch.Tensor masks: input mask
235
+ :param bool extract_resnet_feats: if True, return only the frontend (ResNet) features
236
+ :return: position embedded tensor and mask
237
+ :rtype Tuple[torch.Tensor, torch.Tensor]:
238
+ """
239
+ if isinstance(self.frontend, (Conv1dResNet, Conv3dResNet)):
240
+ xs = self.frontend(xs)
241
+ if extract_resnet_feats:
242
+ return xs
243
+
244
+ if isinstance(self.embed, Conv2dSubsampling):
245
+ xs, masks = self.embed(xs, masks)
246
+ else:
247
+ xs = self.embed(xs)
248
+
249
+ xs, masks = self.encoders(xs, masks)
250
+
251
+ if isinstance(xs, tuple):
252
+ xs = xs[0]
253
+
254
+ if self.normalize_before:
255
+ xs = self.after_norm(xs)
256
+
257
+ return xs, masks
258
+
259
+ def forward_one_step(self, xs, masks, cache=None):
260
+ """Encode input frame.
261
+
262
+ :param torch.Tensor xs: input tensor
263
+ :param torch.Tensor masks: input mask
264
+ :param List[torch.Tensor] cache: cache tensors
265
+ :return: position embedded tensor, mask and new cache
266
+ :rtype Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
267
+ """
268
+ if isinstance(self.frontend, (Conv1dResNet, Conv3dResNet)):
269
+ xs = self.frontend(xs)
270
+
271
+ if isinstance(self.embed, Conv2dSubsampling):
272
+ xs, masks = self.embed(xs, masks)
273
+ else:
274
+ xs = self.embed(xs)
275
+ if cache is None:
276
+ cache = [None for _ in range(len(self.encoders))]
277
+ new_cache = []
278
+ for c, e in zip(cache, self.encoders):
279
+ xs, masks = e(xs, masks, cache=c)
280
+ new_cache.append(xs)
281
+ if self.normalize_before:
282
+ xs = self.after_norm(xs)
283
+ return xs, masks, new_cache
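A hedged construction sketch for the Encoder above; the hyperparameter values are illustrative assumptions, not the values stored in the shipped model.json. Passing encoder_attn_layer_type="rel_mha" switches the positional encoding to RelPositionalEncoding, and macaron_style/use_cnn_module turn each block into a conformer-style layer:

import torch
from espnet.nets.pytorch_backend.transformer.encoder import Encoder

encoder = Encoder(
    idim=80,                            # feature dim for the "linear" input layer
    attention_dim=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    input_layer="linear",               # pre-extracted features; "conv3d" would enable the video frontend
    encoder_attn_layer_type="rel_mha",  # relative-position multi-head attention
    macaron_style=True,
    use_cnn_module=True,
)
feats = torch.randn(2, 100, 80)         # (batch, time, idim)
masks = torch.ones(2, 1, 100, dtype=torch.bool)
out, out_masks = encoder(feats, masks)  # out: (2, 100, 256)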
espnet/nets/pytorch_backend/transformer/encoder_layer.py ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Encoder self-attention layer definition."""
8
+
9
+ import copy
10
+ import torch
11
+
12
+ from torch import nn
13
+
14
+ from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
15
+
16
+
17
+ class EncoderLayer(nn.Module):
18
+ """Encoder layer module.
19
+
20
+ :param int size: input dim
21
+ :param espnet.nets.pytorch_backend.transformer.attention.
22
+ MultiHeadedAttention self_attn: self attention module
23
+ RelPositionMultiHeadedAttention self_attn: self attention module
24
+ :param espnet.nets.pytorch_backend.transformer.positionwise_feed_forward.
25
+ PositionwiseFeedForward feed_forward:
26
+ feed forward module
27
+ :param espnet.nets.pytorch_backend.transformer.convolution.
28
+ ConvolutionModule conv_module:
29
+ convolution module
30
+ :param float dropout_rate: dropout rate
31
+ :param bool normalize_before: whether to use layer_norm before the first block
32
+ :param bool concat_after: whether to concat attention layer's input and output
33
+ if True, additional linear will be applied.
34
+ i.e. x -> x + linear(concat(x, att(x)))
35
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
36
+ :param bool macaron_style: whether to use macaron style for PositionwiseFeedForward
37
+
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ size,
43
+ self_attn,
44
+ feed_forward,
45
+ conv_module,
46
+ dropout_rate,
47
+ normalize_before=True,
48
+ concat_after=False,
49
+ macaron_style=False,
50
+ ):
51
+ """Construct an EncoderLayer object."""
52
+ super(EncoderLayer, self).__init__()
53
+ self.self_attn = self_attn
54
+ self.feed_forward = feed_forward
55
+ self.ff_scale = 1.0
56
+ self.conv_module = conv_module
57
+ self.macaron_style = macaron_style
58
+ self.norm_ff = LayerNorm(size) # for the FNN module
59
+ self.norm_mha = LayerNorm(size) # for the MHA module
60
+ if self.macaron_style:
61
+ self.feed_forward_macaron = copy.deepcopy(feed_forward)
62
+ self.ff_scale = 0.5
63
+ # for another FNN module in macaron style
64
+ self.norm_ff_macaron = LayerNorm(size)
65
+ if self.conv_module is not None:
66
+ self.norm_conv = LayerNorm(size) # for the CNN module
67
+ self.norm_final = LayerNorm(size) # for the final output of the block
68
+ self.dropout = nn.Dropout(dropout_rate)
69
+ self.size = size
70
+ self.normalize_before = normalize_before
71
+ self.concat_after = concat_after
72
+ if self.concat_after:
73
+ self.concat_linear = nn.Linear(size + size, size)
74
+
75
+ def forward(self, x_input, mask, cache=None):
76
+ """Compute encoded features.
77
+
78
+ :param torch.Tensor x_input: encoded source features (batch, max_time_in, size)
79
+ :param torch.Tensor mask: mask for x (batch, max_time_in)
80
+ :param torch.Tensor cache: cache for x (batch, max_time_in - 1, size)
81
+ :rtype: Tuple[torch.Tensor, torch.Tensor]
82
+ """
83
+ if isinstance(x_input, tuple):
84
+ x, pos_emb = x_input[0], x_input[1]
85
+ else:
86
+ x, pos_emb = x_input, None
87
+
88
+ # whether to use macaron style
89
+ if self.macaron_style:
90
+ residual = x
91
+ if self.normalize_before:
92
+ x = self.norm_ff_macaron(x)
93
+ x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
94
+ if not self.normalize_before:
95
+ x = self.norm_ff_macaron(x)
96
+
97
+ # multi-headed self-attention module
98
+ residual = x
99
+ if self.normalize_before:
100
+ x = self.norm_mha(x)
101
+
102
+ if cache is None:
103
+ x_q = x
104
+ else:
105
+ assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
106
+ x_q = x[:, -1:, :]
107
+ residual = residual[:, -1:, :]
108
+ mask = None if mask is None else mask[:, -1:, :]
109
+
110
+ if pos_emb is not None:
111
+ x_att = self.self_attn(x_q, x, x, pos_emb, mask)
112
+ else:
113
+ x_att = self.self_attn(x_q, x, x, mask)
114
+
115
+ if self.concat_after:
116
+ x_concat = torch.cat((x, x_att), dim=-1)
117
+ x = residual + self.concat_linear(x_concat)
118
+ else:
119
+ x = residual + self.dropout(x_att)
120
+ if not self.normalize_before:
121
+ x = self.norm_mha(x)
122
+
123
+ # convolution module
124
+ if self.conv_module is not None:
125
+ residual = x
126
+ if self.normalize_before:
127
+ x = self.norm_conv(x)
128
+ x = residual + self.dropout(self.conv_module(x))
129
+ if not self.normalize_before:
130
+ x = self.norm_conv(x)
131
+
132
+ # feed forward module
133
+ residual = x
134
+ if self.normalize_before:
135
+ x = self.norm_ff(x)
136
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
137
+ if not self.normalize_before:
138
+ x = self.norm_ff(x)
139
+
140
+ if self.conv_module is not None:
141
+ x = self.norm_final(x)
142
+
143
+ if cache is not None:
144
+ x = torch.cat([cache, x], dim=1)
145
+
146
+ if pos_emb is not None:
147
+ return (x, pos_emb), mask
148
+ else:
149
+ return x, mask
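A small sketch of wiring one block by hand (Encoder normally builds these via repeat()); with plain MultiHeadedAttention the layer receives a bare tensor rather than an (x, pos_emb) tuple:

import torch
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import PositionwiseFeedForward

layer = EncoderLayer(
    size=256,
    self_attn=MultiHeadedAttention(4, 256, 0.1),
    feed_forward=PositionwiseFeedForward(256, 2048, 0.1),
    conv_module=None,       # pass a ConvolutionModule here for the conformer variant
    dropout_rate=0.1,
)
x = torch.randn(2, 50, 256)
mask = torch.ones(2, 1, 50, dtype=torch.bool)
y, y_mask = layer(x, mask)  # y: (2, 50, 256); the mask is passed through unchanged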
espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Label smoothing module."""
8
+
9
+ import torch
10
+ from torch import nn
11
+
12
+
13
+ class LabelSmoothingLoss(nn.Module):
14
+ """Label-smoothing loss.
15
+
16
+ :param int size: the number of classes
17
+ :param int padding_idx: ignored class id
18
+ :param float smoothing: smoothing rate (0.0 means the conventional CE)
19
+ :param bool normalize_length: normalize loss by sequence length if True
20
+ :param torch.nn.Module criterion: loss function to be smoothed
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ size,
26
+ padding_idx,
27
+ smoothing,
28
+ normalize_length=False,
29
+ criterion=nn.KLDivLoss(reduction="none"),
30
+ ):
31
+ """Construct a LabelSmoothingLoss object."""
32
+ super(LabelSmoothingLoss, self).__init__()
33
+ self.criterion = criterion
34
+ self.padding_idx = padding_idx
35
+ self.confidence = 1.0 - smoothing
36
+ self.smoothing = smoothing
37
+ self.size = size
38
+ self.true_dist = None
39
+ self.normalize_length = normalize_length
40
+
41
+ def forward(self, x, target):
42
+ """Compute loss between x and target.
43
+
44
+ :param torch.Tensor x: prediction (batch, seqlen, class)
45
+ :param torch.Tensor target:
46
+ target signal masked with self.padding_id (batch, seqlen)
47
+ :return: scalar float value
48
+ :rtype torch.Tensor
49
+ """
50
+ assert x.size(2) == self.size
51
+ batch_size = x.size(0)
52
+ x = x.view(-1, self.size)
53
+ target = target.view(-1)
54
+ with torch.no_grad():
55
+ true_dist = x.clone()
56
+ true_dist.fill_(self.smoothing / (self.size - 1))
57
+ ignore = target == self.padding_idx # (B,)
58
+ total = len(target) - ignore.sum().item()
59
+ target = target.masked_fill(ignore, 0) # avoid -1 index
60
+ true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
61
+ kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
62
+ denom = total if self.normalize_length else batch_size
63
+ return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
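A worked sketch of the smoothing: with size=5 and smoothing=0.1, each off-target class receives 0.1 / (5 - 1) = 0.025 probability mass while the target keeps 0.9, and positions equal to padding_idx are excluded from the loss:

import torch
from espnet.nets.pytorch_backend.transformer.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=5, padding_idx=-1, smoothing=0.1)
logits = torch.randn(2, 3, 5)                   # (batch, seqlen, n_class)
target = torch.tensor([[1, 2, -1], [0, 3, 4]])  # -1 marks padded positions
loss = criterion(logits, target)                # scalar KL divergence to the smoothed targets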
espnet/nets/pytorch_backend/transformer/layer_norm.py ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Layer normalization module."""
8
+
9
+ import torch
10
+
11
+
12
+ class LayerNorm(torch.nn.LayerNorm):
13
+ """Layer normalization module.
14
+
15
+ :param int nout: output dim size
16
+ :param int dim: dimension to be normalized
17
+ """
18
+
19
+ def __init__(self, nout, dim=-1):
20
+ """Construct a LayerNorm object."""
21
+ super(LayerNorm, self).__init__(nout, eps=1e-12)
22
+ self.dim = dim
23
+
24
+ def forward(self, x):
25
+ """Apply layer normalization.
26
+
27
+ :param torch.Tensor x: input tensor
28
+ :return: layer normalized tensor
29
+ :rtype torch.Tensor
30
+ """
31
+ if self.dim == -1:
32
+ return super(LayerNorm, self).forward(x)
33
+ return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)
espnet/nets/pytorch_backend/transformer/mask.py ADDED
@@ -0,0 +1,51 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Shigeki Karita
4
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
5
+
6
+ """Mask module."""
7
+
8
+ from distutils.version import LooseVersion
9
+
10
+ import torch
11
+
12
+ is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2.0")
13
+ # LooseVersion('1.2.0') == LooseVersion(torch.__version__) can't include e.g. 1.2.0+aaa
14
+ is_torch_1_2 = (
15
+ LooseVersion("1.3") > LooseVersion(torch.__version__) >= LooseVersion("1.2")
16
+ )
17
+ datatype = torch.bool if is_torch_1_2_plus else torch.uint8
18
+
19
+
20
+ def subsequent_mask(size, device="cpu", dtype=datatype):
21
+ """Create mask for subsequent steps (size, size).
22
+
23
+ :param int size: size of mask
24
+ :param str device: "cpu" or "cuda" or torch.Tensor.device
25
+ :param torch.dtype dtype: result dtype
26
+ :rtype: torch.Tensor
27
+ >>> subsequent_mask(3)
28
+ [[1, 0, 0],
29
+ [1, 1, 0],
30
+ [1, 1, 1]]
31
+ """
32
+ if is_torch_1_2 and dtype == torch.bool:
33
+ # torch=1.2 doesn't support tril for bool tensor
34
+ ret = torch.ones(size, size, device=device, dtype=torch.uint8)
35
+ return torch.tril(ret, out=ret).type(dtype)
36
+ else:
37
+ ret = torch.ones(size, size, device=device, dtype=dtype)
38
+ return torch.tril(ret, out=ret)
39
+
40
+
41
+ def target_mask(ys_in_pad, ignore_id):
42
+ """Create mask for decoder self-attention.
43
+
44
+ :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
45
+ :param int ignore_id: index of padding
46
+ :param torch.dtype dtype: result dtype
47
+ :rtype: torch.Tensor
48
+ """
49
+ ys_mask = ys_in_pad != ignore_id
50
+ m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0)
51
+ return ys_mask.unsqueeze(-2) & m
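Usage sketch of the two helpers: subsequent_mask builds the causal lower-triangular mask, and target_mask additionally masks out padded target positions, giving the (batch, Lmax, Lmax) mask the decoder self-attention expects:

import torch
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask, target_mask

print(subsequent_mask(3))
# lower-triangular (3, 3) boolean mask with the pattern
# [[1, 0, 0],
#  [1, 1, 0],
#  [1, 1, 1]]

ys_in_pad = torch.tensor([[5, 7, -1]])             # -1 is the padding index
print(target_mask(ys_in_pad, ignore_id=-1).shape)  # torch.Size([1, 3, 3])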
espnet/nets/pytorch_backend/transformer/multi_layer_conv.py ADDED
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Tomoki Hayashi
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Layer modules for FFT block in FastSpeech (Feed-forward Transformer)."""
8
+
9
+ import torch
10
+
11
+
12
+ class MultiLayeredConv1d(torch.nn.Module):
13
+ """Multi-layered conv1d for Transformer block.
14
+
15
+ This is a module of multi-layered conv1d designed
16
+ to replace the positionwise feed-forward network
17
+ in the Transformer block, which is introduced in
18
+ `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
19
+
20
+ .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
21
+ https://arxiv.org/pdf/1905.09263.pdf
22
+
23
+ """
24
+
25
+ def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
26
+ """Initialize MultiLayeredConv1d module.
27
+
28
+ Args:
29
+ in_chans (int): Number of input channels.
30
+ hidden_chans (int): Number of hidden channels.
31
+ kernel_size (int): Kernel size of conv1d.
32
+ dropout_rate (float): Dropout rate.
33
+
34
+ """
35
+ super(MultiLayeredConv1d, self).__init__()
36
+ self.w_1 = torch.nn.Conv1d(
37
+ in_chans,
38
+ hidden_chans,
39
+ kernel_size,
40
+ stride=1,
41
+ padding=(kernel_size - 1) // 2,
42
+ )
43
+ self.w_2 = torch.nn.Conv1d(
44
+ hidden_chans,
45
+ in_chans,
46
+ kernel_size,
47
+ stride=1,
48
+ padding=(kernel_size - 1) // 2,
49
+ )
50
+ self.dropout = torch.nn.Dropout(dropout_rate)
51
+
52
+ def forward(self, x):
53
+ """Calculate forward propagation.
54
+
55
+ Args:
56
+ x (Tensor): Batch of input tensors (B, ..., in_chans).
57
+
58
+ Returns:
59
+ Tensor: Batch of output tensors (B, ..., in_chans).
60
+
61
+ """
62
+ x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
63
+ return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1)
64
+
65
+
66
+ class Conv1dLinear(torch.nn.Module):
67
+ """Conv1D + Linear for Transformer block.
68
+
69
+ A variant of MultiLayeredConv1d, which replaces second conv-layer to linear.
70
+
71
+ """
72
+
73
+ def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
74
+ """Initialize Conv1dLinear module.
75
+
76
+ Args:
77
+ in_chans (int): Number of input channels.
78
+ hidden_chans (int): Number of hidden channels.
79
+ kernel_size (int): Kernel size of conv1d.
80
+ dropout_rate (float): Dropout rate.
81
+
82
+ """
83
+ super(Conv1dLinear, self).__init__()
84
+ self.w_1 = torch.nn.Conv1d(
85
+ in_chans,
86
+ hidden_chans,
87
+ kernel_size,
88
+ stride=1,
89
+ padding=(kernel_size - 1) // 2,
90
+ )
91
+ self.w_2 = torch.nn.Linear(hidden_chans, in_chans)
92
+ self.dropout = torch.nn.Dropout(dropout_rate)
93
+
94
+ def forward(self, x):
95
+ """Calculate forward propagation.
96
+
97
+ Args:
98
+ x (Tensor): Batch of input tensors (B, ..., in_chans).
99
+
100
+ Returns:
101
+ Tensor: Batch of output tensors (B, ..., in_chans).
102
+
103
+ """
104
+ x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
105
+ return self.w_2(self.dropout(x))
espnet/nets/pytorch_backend/transformer/optimizer.py ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Optimizer module."""
8
+
9
+ import torch
10
+
11
+
12
+ class NoamOpt(object):
13
+ """Optim wrapper that implements rate."""
14
+
15
+ def __init__(self, model_size, factor, warmup, optimizer):
16
+ """Construct a NoamOpt object."""
17
+ self.optimizer = optimizer
18
+ self._step = 0
19
+ self.warmup = warmup
20
+ self.factor = factor
21
+ self.model_size = model_size
22
+ self._rate = 0
23
+
24
+ @property
25
+ def param_groups(self):
26
+ """Return param_groups."""
27
+ return self.optimizer.param_groups
28
+
29
+ def step(self):
30
+ """Update parameters and rate."""
31
+ self._step += 1
32
+ rate = self.rate()
33
+ for p in self.optimizer.param_groups:
34
+ p["lr"] = rate
35
+ self._rate = rate
36
+ self.optimizer.step()
37
+
38
+ def rate(self, step=None):
39
+ """Return the Noam learning rate for the given step."""
40
+ if step is None:
41
+ step = self._step
42
+ return (
43
+ self.factor
44
+ * self.model_size ** (-0.5)
45
+ * min(step ** (-0.5), step * self.warmup ** (-1.5))
46
+ )
47
+
48
+ def zero_grad(self):
49
+ """Reset gradient."""
50
+ self.optimizer.zero_grad()
51
+
52
+ def state_dict(self):
53
+ """Return state_dict."""
54
+ return {
55
+ "_step": self._step,
56
+ "warmup": self.warmup,
57
+ "factor": self.factor,
58
+ "model_size": self.model_size,
59
+ "_rate": self._rate,
60
+ "optimizer": self.optimizer.state_dict(),
61
+ }
62
+
63
+ def load_state_dict(self, state_dict):
64
+ """Load state_dict."""
65
+ for key, value in state_dict.items():
66
+ if key == "optimizer":
67
+ self.optimizer.load_state_dict(state_dict["optimizer"])
68
+ else:
69
+ setattr(self, key, value)
70
+
71
+
72
+ def get_std_opt(model, d_model, warmup, factor):
73
+ """Get standard NoamOpt."""
74
+ base = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
75
+ return NoamOpt(d_model, factor, warmup, base)
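The rate() method implements lr(step) = factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5), which grows linearly during warmup, peaks at step == warmup, and then decays as step**-0.5. A quick sketch with a stand-in model:

import torch
from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

model = torch.nn.Linear(256, 256)  # stand-in for a real encoder/decoder
opt = get_std_opt(model, d_model=256, warmup=25000, factor=1.0)
print(opt.rate(step=100))          # small lr early in warmup
print(opt.rate(step=25000))        # peak lr = factor * 256 ** -0.5 * 25000 ** -0.5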
espnet/nets/pytorch_backend/transformer/plot.py ADDED
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ import logging
8
+
9
+ import matplotlib.pyplot as plt
10
+ import numpy
11
+
12
+ from espnet.asr import asr_utils
13
+
14
+
15
+ def _plot_and_save_attention(att_w, filename, xtokens=None, ytokens=None):
16
+ # dynamically import matplotlib due to not found error
17
+ from matplotlib.ticker import MaxNLocator
18
+ import os
19
+
20
+ d = os.path.dirname(filename)
21
+ if not os.path.exists(d):
22
+ os.makedirs(d)
23
+ w, h = plt.figaspect(1.0 / len(att_w))
24
+ fig = plt.Figure(figsize=(w * 2, h * 2))
25
+ axes = fig.subplots(1, len(att_w))
26
+ if len(att_w) == 1:
27
+ axes = [axes]
28
+ for ax, aw in zip(axes, att_w):
29
+ # plt.subplot(1, len(att_w), h)
30
+ ax.imshow(aw.astype(numpy.float32), aspect="auto")
31
+ ax.set_xlabel("Input")
32
+ ax.set_ylabel("Output")
33
+ ax.xaxis.set_major_locator(MaxNLocator(integer=True))
34
+ ax.yaxis.set_major_locator(MaxNLocator(integer=True))
35
+ # Labels for major ticks
36
+ if xtokens is not None:
37
+ ax.set_xticks(numpy.linspace(0, len(xtokens) - 1, len(xtokens)))
38
+ ax.set_xticks(numpy.linspace(0, len(xtokens) - 1, 1), minor=True)
39
+ ax.set_xticklabels(xtokens + [""], rotation=40)
40
+ if ytokens is not None:
41
+ ax.set_yticks(numpy.linspace(0, len(ytokens) - 1, len(ytokens)))
42
+ ax.set_yticks(numpy.linspace(0, len(ytokens) - 1, 1), minor=True)
43
+ ax.set_yticklabels(ytokens + [""])
44
+ fig.tight_layout()
45
+ return fig
46
+
47
+
48
+ def savefig(plot, filename):
49
+ plot.savefig(filename)
50
+ plt.clf()
51
+
52
+
53
+ def plot_multi_head_attention(
54
+ data,
55
+ attn_dict,
56
+ outdir,
57
+ suffix="png",
58
+ savefn=savefig,
59
+ ikey="input",
60
+ iaxis=0,
61
+ okey="output",
62
+ oaxis=0,
63
+ ):
64
+ """Plot multi head attentions.
65
+
66
+ :param dict data: utts info from json file
67
+ :param dict[str, torch.Tensor] attn_dict: multi head attention dict.
68
+ values should be torch.Tensor (head, input_length, output_length)
69
+ :param str outdir: dir to save fig
70
+ :param str suffix: filename suffix including image type (e.g., png)
71
+ :param savefn: function to save
72
+
73
+ """
74
+ for name, att_ws in attn_dict.items():
75
+ for idx, att_w in enumerate(att_ws):
76
+ filename = "%s/%s.%s.%s" % (outdir, data[idx][0], name, suffix)
77
+ dec_len = int(data[idx][1][okey][oaxis]["shape"][0])
78
+ enc_len = int(data[idx][1][ikey][iaxis]["shape"][0])
79
+ xtokens, ytokens = None, None
80
+ if "encoder" in name:
81
+ att_w = att_w[:, :enc_len, :enc_len]
82
+ # for MT
83
+ if "token" in data[idx][1][ikey][iaxis].keys():
84
+ xtokens = data[idx][1][ikey][iaxis]["token"].split()
85
+ ytokens = xtokens[:]
86
+ elif "decoder" in name:
87
+ if "self" in name:
88
+ att_w = att_w[:, : dec_len + 1, : dec_len + 1] # +1 for <sos>
89
+ else:
90
+ att_w = att_w[:, : dec_len + 1, :enc_len] # +1 for <sos>
91
+ # for MT
92
+ if "token" in data[idx][1][ikey][iaxis].keys():
93
+ xtokens = data[idx][1][ikey][iaxis]["token"].split()
94
+ # for ASR/ST/MT
95
+ if "token" in data[idx][1][okey][oaxis].keys():
96
+ ytokens = ["<sos>"] + data[idx][1][okey][oaxis]["token"].split()
97
+ if "self" in name:
98
+ xtokens = ytokens[:]
99
+ else:
100
+ logging.warning("unknown name for shaping attention")
101
+ fig = _plot_and_save_attention(att_w, filename, xtokens, ytokens)
102
+ savefn(fig, filename)
103
+
104
+
105
+ class PlotAttentionReport(asr_utils.PlotAttentionReport):
106
+ def plotfn(self, *args, **kwargs):
107
+ kwargs["ikey"] = self.ikey
108
+ kwargs["iaxis"] = self.iaxis
109
+ kwargs["okey"] = self.okey
110
+ kwargs["oaxis"] = self.oaxis
111
+ plot_multi_head_attention(*args, **kwargs)
112
+
113
+ def __call__(self, trainer):
114
+ attn_dict = self.get_attention_weights()
115
+ suffix = "ep.{.updater.epoch}.png".format(trainer)
116
+ self.plotfn(self.data, attn_dict, self.outdir, suffix, savefig)
117
+
118
+ def get_attention_weights(self):
119
+ batch = self.converter([self.transform(self.data)], self.device)
120
+ if isinstance(batch, tuple):
121
+ att_ws = self.att_vis_fn(*batch)
122
+ elif isinstance(batch, dict):
123
+ att_ws = self.att_vis_fn(**batch)
124
+ return att_ws
125
+
126
+ def log_attentions(self, logger, step):
127
+ def log_fig(plot, filename):
128
+ from os.path import basename
129
+
130
+ logger.add_figure(basename(filename), plot, step)
131
+ plt.clf()
132
+
133
+ attn_dict = self.get_attention_weights()
134
+ self.plotfn(self.data, attn_dict, self.outdir, "", log_fig)
espnet/nets/pytorch_backend/transformer/positionwise_feed_forward.py ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Positionwise feed forward layer definition."""
8
+
9
+ import torch
10
+
11
+
12
+ class PositionwiseFeedForward(torch.nn.Module):
13
+ """Positionwise feed forward layer.
14
+
15
+ :param int idim: input dimension
16
+ :param int hidden_units: number of hidden units
17
+ :param float dropout_rate: dropout rate
18
+
19
+ """
20
+
21
+ def __init__(self, idim, hidden_units, dropout_rate):
22
+ """Construct a PositionwiseFeedForward object."""
23
+ super(PositionwiseFeedForward, self).__init__()
24
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
25
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
26
+ self.dropout = torch.nn.Dropout(dropout_rate)
27
+
28
+ def forward(self, x):
29
+ """Forward function."""
30
+ return self.w_2(self.dropout(torch.relu(self.w_1(x))))
espnet/nets/pytorch_backend/transformer/raw_embeddings.py ADDED
@@ -0,0 +1,77 @@
1
+ import torch
2
+ import logging
3
+
4
+ from espnet.nets.pytorch_backend.backbones.conv3d_extractor import Conv3dResNet
5
+ from espnet.nets.pytorch_backend.backbones.conv1d_extractor import Conv1dResNet
6
+
7
+
8
+ class VideoEmbedding(torch.nn.Module):
9
+ """Video Embedding
10
+
11
+ :param int idim: input dim
12
+ :param int odim: output dim
13
+ :param float dropout_rate: dropout rate
14
+ """
15
+
16
+ def __init__(self, idim, odim, dropout_rate, pos_enc_class, backbone_type="resnet", relu_type="prelu"):
17
+ super(VideoEmbedding, self).__init__()
18
+ self.trunk = Conv3dResNet(
19
+ backbone_type=backbone_type,
20
+ relu_type=relu_type
21
+ )
22
+ self.out = torch.nn.Sequential(
23
+ torch.nn.Linear(idim, odim),
24
+ pos_enc_class,
25
+ )
26
+
27
+ def forward(self, x, x_mask, extract_feats=None):
28
+ """video embedding for x
29
+
30
+ :param torch.Tensor x: input tensor
31
+ :param torch.Tensor x_mask: input mask
32
+ :param str extract_features: the position for feature extraction
33
+ :return: subsampled x and mask
34
+ :rtype Tuple[torch.Tensor, torch.Tensor]
35
+ """
36
+ x_resnet, x_mask = self.trunk(x, x_mask)
37
+ x = self.out(x_resnet)
38
+ if extract_feats:
39
+ return x, x_mask, x_resnet
40
+ else:
41
+ return x, x_mask
42
+
43
+
44
+ class AudioEmbedding(torch.nn.Module):
45
+ """Audio Embedding
46
+
47
+ :param int idim: input dim
48
+ :param int odim: output dim
49
+ :param float dropout_rate: dropout rate
50
+ """
51
+
52
+ def __init__(self, idim, odim, dropout_rate, pos_enc_class, relu_type="prelu", a_upsample_ratio=1):
53
+ super(AudioEmbedding, self).__init__()
54
+ self.trunk = Conv1dResNet(
55
+ relu_type=relu_type,
56
+ a_upsample_ratio=a_upsample_ratio,
57
+ )
58
+ self.out = torch.nn.Sequential(
59
+ torch.nn.Linear(idim, odim),
60
+ pos_enc_class,
61
+ )
62
+
63
+ def forward(self, x, x_mask, extract_feats=None):
64
+ """audio embedding for x
65
+
66
+ :param torch.Tensor x: input tensor
67
+ :param torch.Tensor x_mask: input mask
68
+ :param str extract_features: the position for feature extraction
69
+ :return: subsampled x and mask
70
+ :rtype Tuple[torch.Tensor, torch.Tensor]
71
+ """
72
+ x_resnet, x_mask = self.trunk(x, x_mask)
73
+ x = self.out(x_resnet)
74
+ if extract_feats:
75
+ return x, x_mask, x_resnet
76
+ else:
77
+ return x, x_mask
espnet/nets/pytorch_backend/transformer/repeat.py ADDED
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Repeat the same layer definition."""
8
+
9
+ import torch
10
+
11
+
12
+ class MultiSequential(torch.nn.Sequential):
13
+ """Multi-input multi-output torch.nn.Sequential."""
14
+
15
+ def forward(self, *args):
16
+ """Repeat."""
17
+ for m in self:
18
+ args = m(*args)
19
+ return args
20
+
21
+
22
+ def repeat(N, fn):
23
+ """Repeat module N times.
24
+
25
+ :param int N: repeat time
26
+ :param function fn: function to generate module
27
+ :return: repeated modules
28
+ :rtype: MultiSequential
29
+ """
30
+ return MultiSequential(*[fn() for _ in range(N)])
espnet/nets/pytorch_backend/transformer/subsampling.py ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Subsampling layer definition."""
8
+
9
+ import torch
10
+
11
+
12
+ class Conv2dSubsampling(torch.nn.Module):
13
+ """Convolutional 2D subsampling (to 1/4 length).
14
+
15
+ :param int idim: input dim
16
+ :param int odim: output dim
17
+ :param float dropout_rate: dropout rate
18
+ :param nn.Module pos_enc_class: positional encoding layer
19
+
20
+ """
21
+
22
+ def __init__(self, idim, odim, dropout_rate, pos_enc_class):
23
+ """Construct a Conv2dSubsampling object."""
24
+ super(Conv2dSubsampling, self).__init__()
25
+ self.conv = torch.nn.Sequential(
26
+ torch.nn.Conv2d(1, odim, 3, 2),
27
+ torch.nn.ReLU(),
28
+ torch.nn.Conv2d(odim, odim, 3, 2),
29
+ torch.nn.ReLU(),
30
+ )
31
+ self.out = torch.nn.Sequential(
32
+ torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), pos_enc_class,
33
+ )
34
+
35
+ def forward(self, x, x_mask):
36
+ """Subsample x.
37
+
38
+ :param torch.Tensor x: input tensor
39
+ :param torch.Tensor x_mask: input mask
40
+ :return: subsampled x and mask
41
+ :rtype Tuple[torch.Tensor, torch.Tensor]
42
+ or Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]
43
+ """
44
+ x = x.unsqueeze(1) # (b, c, t, f)
45
+ x = self.conv(x)
46
+ b, c, t, f = x.size()
47
+ # if RelPositionalEncoding, x: Tuple[torch.Tensor, torch.Tensor]
48
+ # else x: torch.Tensor
49
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
50
+ if x_mask is None:
51
+ return x, None
52
+ return x, x_mask[:, :, :-2:2][:, :, :-2:2]
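Shape sketch for the subsampling: the two stride-2 convolutions shrink the time axis to roughly a quarter and the mask is sliced with the same pattern, so for idim=80 a 100-frame input comes out as 24 frames:

import torch
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling

subsample = Conv2dSubsampling(
    idim=80, odim=256, dropout_rate=0.1,
    pos_enc_class=PositionalEncoding(256, 0.1),
)
x = torch.randn(2, 100, 80)        # (batch, time, feature)
mask = torch.ones(2, 1, 100, dtype=torch.bool)
y, y_mask = subsample(x, mask)
print(y.shape, y_mask.shape)       # (2, 24, 256) and (2, 1, 24)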
espnet/nets/scorer_interface.py ADDED
@@ -0,0 +1,188 @@
1
+ """Scorer interface module."""
2
+
3
+ from typing import Any
4
+ from typing import List
5
+ from typing import Tuple
6
+
7
+ import torch
8
+ import warnings
9
+
10
+
11
+ class ScorerInterface:
12
+ """Scorer interface for beam search.
13
+
14
+ The scorer performs scoring of all tokens in the vocabulary.
15
+
16
+ Examples:
17
+ * Search heuristics
18
+ * :class:`espnet.nets.scorers.length_bonus.LengthBonus`
19
+ * Decoder networks of the sequence-to-sequence models
20
+ * :class:`espnet.nets.pytorch_backend.nets.transformer.decoder.Decoder`
21
+ * :class:`espnet.nets.pytorch_backend.nets.rnn.decoders.Decoder`
22
+ * Neural language models
23
+ * :class:`espnet.nets.pytorch_backend.lm.transformer.TransformerLM`
24
+ * :class:`espnet.nets.pytorch_backend.lm.default.DefaultRNNLM`
25
+ * :class:`espnet.nets.pytorch_backend.lm.seq_rnn.SequentialRNNLM`
26
+
27
+ """
28
+
29
+ def init_state(self, x: torch.Tensor) -> Any:
30
+ """Get an initial state for decoding (optional).
31
+
32
+ Args:
33
+ x (torch.Tensor): The encoded feature tensor
34
+
35
+ Returns: initial state
36
+
37
+ """
38
+ return None
39
+
40
+ def select_state(self, state: Any, i: int, new_id: int = None) -> Any:
41
+ """Select state with relative ids in the main beam search.
42
+
43
+ Args:
44
+ state: Decoder state for prefix tokens
45
+ i (int): Index to select a state in the main beam search
46
+ new_id (int): New label index to select a state if necessary
47
+
48
+ Returns:
49
+ state: pruned state
50
+
51
+ """
52
+ return None if state is None else state[i]
53
+
54
+ def score(
55
+ self, y: torch.Tensor, state: Any, x: torch.Tensor
56
+ ) -> Tuple[torch.Tensor, Any]:
57
+ """Score new token (required).
58
+
59
+ Args:
60
+ y (torch.Tensor): 1D torch.int64 prefix tokens.
61
+ state: Scorer state for prefix tokens
62
+ x (torch.Tensor): The encoder feature that generates ys.
63
+
64
+ Returns:
65
+ tuple[torch.Tensor, Any]: Tuple of
66
+ scores for next token that has a shape of `(n_vocab)`
67
+ and next state for ys
68
+
69
+ """
70
+ raise NotImplementedError
71
+
72
+ def final_score(self, state: Any) -> float:
73
+ """Score eos (optional).
74
+
75
+ Args:
76
+ state: Scorer state for prefix tokens
77
+
78
+ Returns:
79
+ float: final score
80
+
81
+ """
82
+ return 0.0
83
+
84
+
85
+ class BatchScorerInterface(ScorerInterface):
86
+ """Batch scorer interface."""
87
+
88
+ def batch_init_state(self, x: torch.Tensor) -> Any:
89
+ """Get an initial state for decoding (optional).
90
+
91
+ Args:
92
+ x (torch.Tensor): The encoded feature tensor
93
+
94
+ Returns: initial state
95
+
96
+ """
97
+ return self.init_state(x)
98
+
99
+ def batch_score(
100
+ self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
101
+ ) -> Tuple[torch.Tensor, List[Any]]:
102
+ """Score new token batch (required).
103
+
104
+ Args:
105
+ ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
106
+ states (List[Any]): Scorer states for prefix tokens.
107
+ xs (torch.Tensor):
108
+ The encoder feature that generates ys (n_batch, xlen, n_feat).
109
+
110
+ Returns:
111
+ tuple[torch.Tensor, List[Any]]: Tuple of
112
+ batchified scores for next token with shape of `(n_batch, n_vocab)`
113
+ and next state list for ys.
114
+
115
+ """
116
+ warnings.warn(
117
+ "{} batch score is implemented through for loop not parallelized".format(
118
+ self.__class__.__name__
119
+ )
120
+ )
121
+ scores = list()
122
+ outstates = list()
123
+ for i, (y, state, x) in enumerate(zip(ys, states, xs)):
124
+ score, outstate = self.score(y, state, x)
125
+ outstates.append(outstate)
126
+ scores.append(score)
127
+ scores = torch.cat(scores, 0).view(ys.shape[0], -1)
128
+ return scores, outstates
129
+
130
+
131
+ class PartialScorerInterface(ScorerInterface):
132
+ """Partial scorer interface for beam search.
133
+
134
+ The partial scorer performs scoring after the non-partial scorers have finished,
135
+ and receives pre-pruned next tokens because scoring all the tokens would be
136
+ too heavy.
137
+
138
+ Examples:
139
+ * Prefix search for connectionist-temporal-classification models
140
+ * :class:`espnet.nets.scorers.ctc.CTCPrefixScorer`
141
+
142
+ """
143
+
144
+ def score_partial(
145
+ self, y: torch.Tensor, next_tokens: torch.Tensor, state: Any, x: torch.Tensor
146
+ ) -> Tuple[torch.Tensor, Any]:
147
+ """Score new token (required).
148
+
149
+ Args:
150
+ y (torch.Tensor): 1D prefix token
151
+ next_tokens (torch.Tensor): torch.int64 next token to score
152
+ state: decoder state for prefix tokens
153
+ x (torch.Tensor): The encoder feature that generates ys
154
+
155
+ Returns:
156
+ tuple[torch.Tensor, Any]:
157
+ Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
158
+ and next state for ys
159
+
160
+ """
161
+ raise NotImplementedError
162
+
163
+
164
+ class BatchPartialScorerInterface(BatchScorerInterface, PartialScorerInterface):
165
+ """Batch partial scorer interface for beam search."""
166
+
167
+ def batch_score_partial(
168
+ self,
169
+ ys: torch.Tensor,
170
+ next_tokens: torch.Tensor,
171
+ states: List[Any],
172
+ xs: torch.Tensor,
173
+ ) -> Tuple[torch.Tensor, Any]:
174
+ """Score new token (required).
175
+
176
+ Args:
177
+ ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
178
+ next_tokens (torch.Tensor): torch.int64 tokens to score (n_batch, n_token).
179
+ states (List[Any]): Scorer states for prefix tokens.
180
+ xs (torch.Tensor):
181
+ The encoder feature that generates ys (n_batch, xlen, n_feat).
182
+
183
+ Returns:
184
+ tuple[torch.Tensor, Any]:
185
+ Tuple of a score tensor for ys that has a shape `(n_batch, n_vocab)`
186
+ and next states for ys
187
+ """
188
+ raise NotImplementedError
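A toy sketch of the contract: any object implementing score() (plus the optional init_state/select_state/final_score) can be plugged into the beam search next to the decoder, LM and CTC scorers; the class name below is hypothetical and only illustrates the interface:

import math
from typing import Any, Tuple

import torch

from espnet.nets.scorer_interface import ScorerInterface


class UniformScorer(ScorerInterface):
    """Toy scorer assigning the same log-probability to every token."""

    def __init__(self, n_vocab: int):
        self.n_vocab = n_vocab

    def score(self, y: torch.Tensor, state: Any, x: torch.Tensor) -> Tuple[torch.Tensor, Any]:
        # uniform log-probability over the vocabulary, no recurrent state
        scores = torch.full((self.n_vocab,), -math.log(self.n_vocab), device=x.device, dtype=x.dtype)
        return scores, None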
espnet/nets/scorers/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Initialize sub package."""
espnet/nets/scorers/ctc.py ADDED
@@ -0,0 +1,158 @@
1
+ """ScorerInterface implementation for CTC."""
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ from espnet.nets.ctc_prefix_score import CTCPrefixScore
7
+ from espnet.nets.ctc_prefix_score import CTCPrefixScoreTH
8
+ from espnet.nets.scorer_interface import BatchPartialScorerInterface
9
+
10
+
11
+ class CTCPrefixScorer(BatchPartialScorerInterface):
12
+ """Decoder interface wrapper for CTCPrefixScore."""
13
+
14
+ def __init__(self, ctc: torch.nn.Module, eos: int):
15
+ """Initialize class.
16
+
17
+ Args:
18
+ ctc (torch.nn.Module): The CTC implementation.
19
+ For example, :class:`espnet.nets.pytorch_backend.ctc.CTC`
20
+ eos (int): The end-of-sequence id.
21
+
22
+ """
23
+ self.ctc = ctc
24
+ self.eos = eos
25
+ self.impl = None
26
+
27
+ def init_state(self, x: torch.Tensor):
28
+ """Get an initial state for decoding.
29
+
30
+ Args:
31
+ x (torch.Tensor): The encoded feature tensor
32
+
33
+ Returns: initial state
34
+
35
+ """
36
+ logp = self.ctc.log_softmax(x.unsqueeze(0)).detach().squeeze(0).cpu().numpy()
37
+ # TODO(karita): use CTCPrefixScoreTH
38
+ self.impl = CTCPrefixScore(logp, 0, self.eos, np)
39
+ return 0, self.impl.initial_state()
40
+
41
+ def select_state(self, state, i, new_id=None):
42
+ """Select state with relative ids in the main beam search.
43
+
44
+ Args:
45
+ state: Decoder state for prefix tokens
46
+ i (int): Index to select a state in the main beam search
47
+ new_id (int): New label id to select a state if necessary
48
+
49
+ Returns:
50
+ state: pruned state
51
+
52
+ """
53
+ if type(state) == tuple:
54
+ if len(state) == 2: # for CTCPrefixScore
55
+ sc, st = state
56
+ return sc[i], st[i]
57
+ else: # for CTCPrefixScoreTH (need new_id > 0)
58
+ r, log_psi, f_min, f_max, scoring_idmap = state
59
+ s = log_psi[i, new_id].expand(log_psi.size(1))
60
+ if scoring_idmap is not None:
61
+ return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max
62
+ else:
63
+ return r[:, :, i, new_id], s, f_min, f_max
64
+ return None if state is None else state[i]
65
+
66
+ def score_partial(self, y, ids, state, x):
67
+ """Score new token.
68
+
69
+ Args:
70
+ y (torch.Tensor): 1D prefix token
71
+ ids (torch.Tensor): torch.int64 next tokens to score
72
+ state: decoder state for prefix tokens
73
+ x (torch.Tensor): 2D encoder feature that generates ys
74
+
75
+ Returns:
76
+ tuple[torch.Tensor, Any]:
77
+ Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
78
+ and next state for ys
79
+
80
+ """
81
+ prev_score, state = state
82
+ presub_score, new_st = self.impl(y.cpu(), ids.cpu(), state)
83
+ tscore = torch.as_tensor(
84
+ presub_score - prev_score, device=x.device, dtype=x.dtype
85
+ )
86
+ return tscore, (presub_score, new_st)
87
+
88
+ def batch_init_state(self, x: torch.Tensor):
89
+ """Get an initial state for decoding.
90
+
91
+ Args:
92
+ x (torch.Tensor): The encoded feature tensor
93
+
94
+ Returns: initial state
95
+
96
+ """
97
+ logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1
98
+ xlen = torch.tensor([logp.size(1)])
99
+ self.impl = CTCPrefixScoreTH(logp, xlen, 0, self.eos)
100
+ return None
101
+
102
+ def batch_score_partial(self, y, ids, state, x):
103
+ """Score new token.
104
+
105
+ Args:
106
+ y (torch.Tensor): 1D prefix token
107
+ ids (torch.Tensor): torch.int64 next token to score
108
+ state: decoder state for prefix tokens
109
+ x (torch.Tensor): 2D encoder feature that generates ys
110
+
111
+ Returns:
112
+ tuple[torch.Tensor, Any]:
113
+ Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
114
+ and next state for ys
115
+
116
+ """
117
+ batch_state = (
118
+ (
119
+ torch.stack([s[0] for s in state], dim=2),
120
+ torch.stack([s[1] for s in state]),
121
+ state[0][2],
122
+ state[0][3],
123
+ )
124
+ if state[0] is not None
125
+ else None
126
+ )
127
+ return self.impl(y, batch_state, ids)
128
+
129
+ def extend_prob(self, x: torch.Tensor):
130
+ """Extend probs for decoding.
131
+
132
+ This extension is for streaming decoding
133
+ as in Eq (14) in https://arxiv.org/abs/2006.14941
134
+
135
+ Args:
136
+ x (torch.Tensor): The encoded feature tensor
137
+
138
+ """
139
+ logp = self.ctc.log_softmax(x.unsqueeze(0))
140
+ self.impl.extend_prob(logp)
141
+
142
+ def extend_state(self, state):
143
+ """Extend state for decoding.
144
+
145
+ This extension is for streaming decoding
146
+ as in Eq (14) in https://arxiv.org/abs/2006.14941
147
+
148
+ Args:
149
+ state: The states of hyps
150
+
151
+ Returns: extended state
152
+
153
+ """
154
+ new_state = []
155
+ for s in state:
156
+ new_state.append(self.impl.extend_state(s))
157
+
158
+ return new_state
espnet/nets/scorers/length_bonus.py ADDED
@@ -0,0 +1,61 @@
1
+ """Length bonus module."""
2
+ from typing import Any
3
+ from typing import List
4
+ from typing import Tuple
5
+
6
+ import torch
7
+
8
+ from espnet.nets.scorer_interface import BatchScorerInterface
9
+
10
+
11
+ class LengthBonus(BatchScorerInterface):
12
+ """Length bonus in beam search."""
13
+
14
+ def __init__(self, n_vocab: int):
15
+ """Initialize class.
16
+
17
+ Args:
18
+ n_vocab (int): The number of tokens in vocabulary for beam search
19
+
20
+ """
21
+ self.n = n_vocab
22
+
23
+ def score(self, y, state, x):
24
+ """Score new token.
25
+
26
+ Args:
27
+ y (torch.Tensor): 1D torch.int64 prefix tokens.
28
+ state: Scorer state for prefix tokens
29
+ x (torch.Tensor): 2D encoder feature that generates ys.
30
+
31
+ Returns:
32
+ tuple[torch.Tensor, Any]: Tuple of
33
+ torch.float32 scores for next token (n_vocab)
34
+ and None
35
+
36
+ """
37
+ return torch.tensor([1.0], device=x.device, dtype=x.dtype).expand(self.n), None
38
+
39
+ def batch_score(
40
+ self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
41
+ ) -> Tuple[torch.Tensor, List[Any]]:
42
+ """Score new token batch.
43
+
44
+ Args:
45
+ ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
46
+ states (List[Any]): Scorer states for prefix tokens.
47
+ xs (torch.Tensor):
48
+ The encoder feature that generates ys (n_batch, xlen, n_feat).
49
+
50
+ Returns:
51
+ tuple[torch.Tensor, List[Any]]: Tuple of
52
+ batchfied scores for next token with shape of `(n_batch, n_vocab)`
53
+ and next state list for ys.
54
+
55
+ """
56
+ return (
57
+ torch.tensor([1.0], device=xs.device, dtype=xs.dtype).expand(
58
+ ys.shape[0], self.n
59
+ ),
60
+ None,
61
+ )
espnet/utils/cli_utils.py ADDED
@@ -0,0 +1,65 @@
1
+ from collections.abc import Sequence
2
+ from distutils.util import strtobool as dist_strtobool
3
+ import sys
4
+
5
+ import numpy
6
+
7
+
8
+ def strtobool(x):
9
+ # distutils.util.strtobool returns an int, which is confusing, so cast to bool.
10
+ return bool(dist_strtobool(x))
11
+
12
+
13
+ def get_commandline_args():
14
+ extra_chars = [
15
+ " ",
16
+ ";",
17
+ "&",
18
+ "(",
19
+ ")",
20
+ "|",
21
+ "^",
22
+ "<",
23
+ ">",
24
+ "?",
25
+ "*",
26
+ "[",
27
+ "]",
28
+ "$",
29
+ "`",
30
+ '"',
31
+ "\\",
32
+ "!",
33
+ "{",
34
+ "}",
35
+ ]
36
+
37
+ # Escape the extra characters for shell
38
+ argv = [
39
+ arg.replace("'", "'\\''")
40
+ if all(char not in arg for char in extra_chars)
41
+ else "'" + arg.replace("'", "'\\''") + "'"
42
+ for arg in sys.argv
43
+ ]
44
+
45
+ return sys.executable + " " + " ".join(argv)
46
+
47
+
48
+ def is_scipy_wav_style(value):
49
+ # If Tuple[int, numpy.ndarray] or not
50
+ return (
51
+ isinstance(value, Sequence)
52
+ and len(value) == 2
53
+ and isinstance(value[0], int)
54
+ and isinstance(value[1], numpy.ndarray)
55
+ )
56
+
57
+
58
+ def assert_scipy_wav_style(value):
59
+ assert is_scipy_wav_style(
60
+ value
61
+ ), "Must be Tuple[int, numpy.ndarray], but got {}".format(
62
+ type(value)
63
+ if not isinstance(value, Sequence)
64
+ else "{}[{}]".format(type(value), ", ".join(str(type(v)) for v in value))
65
+ )
espnet/utils/dynamic_import.py ADDED
@@ -0,0 +1,23 @@
1
+ import importlib
2
+
3
+
4
+ def dynamic_import(import_path, alias=dict()):
5
+ """Dynamically import a module and return the requested class.
6
+
7
+ :param str import_path: syntax 'module_name:class_name'
8
+ e.g., 'espnet.transform.add_deltas:AddDeltas'
9
+ :param dict alias: shortcut for registered class
10
+ :return: imported class
11
+ """
12
+ if import_path not in alias and ":" not in import_path:
13
+ raise ValueError(
14
+ "import_path should be one of {} or "
15
+ 'include ":", e.g. "espnet.transform.add_deltas:AddDeltas" : '
16
+ "{}".format(set(alias), import_path)
17
+ )
18
+ if ":" not in import_path:
19
+ import_path = alias[import_path]
20
+
21
+ module_name, objname = import_path.split(":")
22
+ m = importlib.import_module(module_name)
23
+ return getattr(m, objname)
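Usage sketch: the "module:Class" string maps directly onto importlib.import_module plus getattr, so the result is the same class object a regular import would give:

from espnet.utils.dynamic_import import dynamic_import

Encoder = dynamic_import("espnet.nets.pytorch_backend.transformer.encoder:Encoder")
encoder = Encoder(idim=80)  # identical to importing Encoder directly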
espnet/utils/fill_missing_args.py ADDED
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2018 Nagoya University (Tomoki Hayashi)
4
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
5
+
6
+ import argparse
7
+ import logging
8
+
9
+
10
+ def fill_missing_args(args, add_arguments):
11
+ """Fill missing arguments in args.
12
+
13
+ Args:
14
+ args (Namespace or None): Namespace containing hyperparameters.
15
+ add_arguments (function): Function to add arguments.
16
+
17
+ Returns:
18
+ Namespace: Arguments whose missing ones are filled with default value.
19
+
20
+ Examples:
21
+ >>> from argparse import Namespace
22
+ >>> from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2
23
+ >>> args = Namespace()
24
+ >>> fill_missing_args(args, Tacotron2.add_arguments_fn)
25
+ Namespace(aconv_chans=32, aconv_filts=15, adim=512, atype='location', ...)
26
+
27
+ """
28
+ # check argument type
29
+ assert isinstance(args, argparse.Namespace) or args is None
30
+ assert callable(add_arguments)
31
+
32
+ # get default arguments
33
+ default_args, _ = add_arguments(argparse.ArgumentParser()).parse_known_args()
34
+
35
+ # convert to dict
36
+ args = {} if args is None else vars(args)
37
+ default_args = vars(default_args)
38
+
39
+ for key, value in default_args.items():
40
+ if key not in args:
41
+ logging.info(
42
+ 'attribute "%s" does not exist. use default %s.' % (key, str(value))
43
+ )
44
+ args[key] = value
45
+
46
+ return argparse.Namespace(**args)
pipelines/.DS_Store ADDED
Binary file (6.15 kB). View file
 
pipelines/data/.DS_Store ADDED
Binary file (6.15 kB). View file