namelessai
/

audiosr

Model card Files Files and versions Community

namelessai committed on Dec 5, 2024

Commit

13ddedd

verified ·

1 Parent(s): a496765

Upload 3 files

Browse files

Files changed (3) hide show

utilities/model.py +167 -0
utilities/sampler.py +588 -0
utilities/tools.py +541 -0

utilities/model.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import torch
+import audiosr.hifigan as hifigan
def get_vocoder_config():
    """Return the HiFi-GAN settings for the 16 kHz / 64-mel-bin vocoder.

    Returns:
        dict: a freshly built configuration dictionary (safe to mutate).
    """
    # Distributed-training settings carried along with the preset.
    dist_config = dict(
        dist_backend="nccl",
        dist_url="tcp://localhost:54321",
        world_size=1,
    )
    return dict(
        resblock="1",
        num_gpus=6,
        batch_size=16,
        learning_rate=0.0002,
        adam_b1=0.8,
        adam_b2=0.99,
        lr_decay=0.999,
        seed=1234,
        upsample_rates=[5, 4, 2, 2, 2],
        upsample_kernel_sizes=[16, 16, 8, 4, 4],
        upsample_initial_channel=1024,
        resblock_kernel_sizes=[3, 7, 11],
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        segment_size=8192,
        num_mels=64,
        num_freq=1025,
        n_fft=1024,
        hop_size=160,
        win_size=1024,
        sampling_rate=16000,
        fmin=0,
        fmax=8000,
        fmax_for_loss=None,
        num_workers=4,
        dist_config=dist_config,
    )
def get_vocoder_config_48k():
    """Return the HiFi-GAN settings for the 48 kHz / 256-mel-bin vocoder.

    Returns:
        dict: a freshly built configuration dictionary (safe to mutate).
    """
    # Distributed-training settings carried along with the preset.
    dist_config = dict(
        dist_backend="nccl",
        dist_url="tcp://localhost:18273",
        world_size=1,
    )
    # One dilation triple per residual-block kernel size.
    dilations = [[1, 3, 5] for _ in range(4)]
    return dict(
        resblock="1",
        num_gpus=8,
        batch_size=128,
        learning_rate=0.0001,
        adam_b1=0.8,
        adam_b2=0.99,
        lr_decay=0.999,
        seed=1234,
        upsample_rates=[6, 5, 4, 2, 2],
        upsample_kernel_sizes=[12, 10, 8, 4, 4],
        upsample_initial_channel=1536,
        resblock_kernel_sizes=[3, 7, 11, 15],
        resblock_dilation_sizes=dilations,
        segment_size=15360,
        num_mels=256,
        n_fft=2048,
        hop_size=480,
        win_size=2048,
        sampling_rate=48000,
        fmin=20,
        fmax=24000,
        fmax_for_loss=None,
        num_workers=8,
        dist_config=dist_config,
    )
def get_available_checkpoint_keys(model, ckpt):
    """Select the checkpoint entries that can be loaded into ``model``.

    An entry is kept only when its key exists in ``model.state_dict()`` and
    the stored tensor has the same size as the model's tensor. Every other
    entry is reported on stdout and dropped.

    Args:
        model: module exposing ``state_dict()``.
        ckpt: path (or file-like object) of a checkpoint whose top-level
            ``"state_dict"`` entry maps parameter names to tensors.

    Returns:
        dict: the loadable subset of the checkpoint's ``"state_dict"``.
    """
    # Load onto the CPU so a checkpoint written on a GPU machine can still be
    # inspected on a host without CUDA; load_state_dict copies to the
    # model's device afterwards.
    state_dict = torch.load(ckpt, map_location="cpu")["state_dict"]
    current_state_dict = model.state_dict()
    new_state_dict = {}
    for k, tensor in state_dict.items():
        target = current_state_dict.get(k)
        if target is not None and target.size() == tensor.size():
            new_state_dict[k] = tensor
        else:
            print("==> WARNING: Skipping %s" % k)
    print(
        "%s out of %s keys are matched" % (len(new_state_dict), len(state_dict))
    )
    return new_state_dict
def get_param_num(model):
    """Count the scalar elements across all tensors in ``model.parameters()``.

    Returns:
        int: total number of elements (0 when the model has no parameters).
    """
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total
def torch_version_orig_mod_remove(state_dict):
    """Strip every ``"_orig_mod."`` fragment from the generator's key names.

    Only the ``"generator"`` entry of ``state_dict`` is read; the result holds
    that single entry with renamed keys and the original values.

    Args:
        state_dict: mapping with a ``"generator"`` sub-dictionary.

    Returns:
        dict: ``{"generator": {...}}`` with cleaned key names.
    """
    marker = "_orig_mod."
    generator = state_dict["generator"]
    # str.replace is a no-op for names that do not contain the marker.
    cleaned = {name.replace(marker, ""): weight for name, weight in generator.items()}
    return {"generator": cleaned}
def get_vocoder(config, device, mel_bins):
    """Build a HiFi-GAN generator in inference mode for the given mel size.

    No checkpoint is loaded here: the generator is returned with whatever
    weights ``hifigan.Generator_old`` initialises, so the caller is
    responsible for loading trained weights.

    Args:
        config: accepted for interface compatibility but not used; the
            architecture always comes from the built-in presets.
        device: device the generator is moved to.
        mel_bins: ``64`` selects the 16 kHz preset; any other value selects
            the 48 kHz preset.

    Returns:
        The generator, in eval mode, with weight norm removed, on ``device``.
    """
    # The former MelGAN path could never run (the vocoder name was hard-coded
    # to "HiFi-GAN") and both HiFi-GAN branches differed only in the preset,
    # so the preset choice is the single remaining decision.
    preset = get_vocoder_config() if mel_bins == 64 else get_vocoder_config_48k()
    vocoder = hifigan.Generator_old(hifigan.AttrDict(preset))
    vocoder.eval()
    vocoder.remove_weight_norm()
    vocoder.to(device)
    return vocoder
def vocoder_infer(mels, vocoder, lengths=None):
    """Run the vocoder on ``mels`` and return 16-bit PCM samples.

    Args:
        mels: input passed straight to ``vocoder``.
        vocoder: callable returning a tensor whose dimension 1 is squeezed
            away (assumes a (batch, 1, samples) output — TODO confirm).
        lengths: optional upper bound applied to the last axis of every
            waveform (``wavs[:, :lengths]``).

    Returns:
        numpy.ndarray: ``int16`` waveforms.
    """
    with torch.no_grad():
        wavs = vocoder(mels).squeeze(1)
        # Saturate instead of letting the int16 cast overflow: a sample of
        # exactly +1.0 scales to 32768, which does not fit in int16.
        wavs = (wavs * 32768).clamp(-32768, 32767)
    wavs = wavs.cpu().numpy().astype("int16")
    if lengths is not None:
        wavs = wavs[:, :lengths]
    return wavs

utilities/sampler.py ADDED Viewed

	@@ -0,0 +1,588 @@

+from typing import Iterator, List, Optional, Union
+from collections import Counter
+import logging
+from operator import itemgetter
+import random
+import numpy as np
+from torch.utils.data import DistributedSampler
+from torch.utils.data.sampler import Sampler
+LOGGER = logging.getLogger(__name__)
+from torch.utils.data import Dataset, Sampler
class DatasetFromSampler(Dataset):
    """Expose the indices produced by a sampler as an indexable dataset.

    The sampler is iterated once, on the first item access, and the resulting
    list is cached in ``sampler_list`` for later lookups.

    Args:
        sampler: PyTorch sampler
    """

    def __init__(self, sampler: Sampler):
        """Store the sampler; its output is materialised lazily."""
        self.sampler = sampler
        self.sampler_list = None

    def __getitem__(self, index: int):
        """Return the ``index``-th value produced by the sampler.

        Args:
            index: position within the materialised sampler output

        Returns:
            Single element by index
        """
        cached = self.sampler_list
        if cached is None:
            # First access: draw the whole sampler once and keep the result.
            cached = list(self.sampler)
            self.sampler_list = cached
        return cached[index]

    def __len__(self) -> int:
        """
        Returns:
            int: length reported by the wrapped sampler
        """
        return len(self.sampler)
class BalanceClassSampler(Sampler):
    """Sampler that draws the same number of items from every class.

    Args:
        labels: list of class label for each elem in the dataset
        mode: Strategy to balance classes. ``"downsampling"`` uses the size of
            the smallest class, ``"upsampling"`` the size of the largest one,
            and an ``int`` is taken as the number of samples per class.

    Example:

    .. code-block:: python

        from torch.utils.data import DataLoader

        train_labels = train_data.targets.cpu().numpy().tolist()
        train_sampler = BalanceClassSampler(train_labels, mode=5000)
        loader = DataLoader(train_data, sampler=train_sampler, batch_size=32)
    """

    def __init__(self, labels: List[int], mode: Union[str, int] = "downsampling"):
        """Sampler initialisation."""
        super().__init__(labels)

        labels = np.array(labels)
        unique_labels = set(labels)
        positions = np.arange(len(labels))

        class_sizes = {label: (labels == label).sum() for label in unique_labels}
        # Dataset positions of every class, keyed by label.
        self.lbl2idx = {
            label: positions[labels == label].tolist() for label in unique_labels
        }

        if isinstance(mode, str):
            assert mode in ["downsampling", "upsampling"]

        if isinstance(mode, int):
            per_class = mode
        elif mode == "upsampling":
            per_class = max(class_sizes.values())
        else:
            per_class = min(class_sizes.values())

        self.labels = labels
        self.samples_per_class = per_class
        self.length = self.samples_per_class * len(unique_labels)

    def __iter__(self) -> Iterator[int]:
        """
        Returns:
            iterator of indices of stratified sample
        """
        picked = []
        for label in sorted(self.lbl2idx):
            pool = self.lbl2idx[label]
            # Sample with replacement only when the class is too small.
            with_replacement = self.samples_per_class > len(pool)
            drawn = np.random.choice(
                pool, self.samples_per_class, replace=with_replacement
            )
            picked += drawn.tolist()
        assert len(picked) == self.length
        np.random.shuffle(picked)
        return iter(picked)

    def __len__(self) -> int:
        """
        Returns:
             length of result sample
        """
        return self.length
class BatchBalanceClassSampler(Sampler):
    """
    Batch sampler usable for both metric learning and classification tasks.

    For a dataset with C unique classes every batch is built by:

    - picking ``num_classes`` of the C classes,
    - picking ``num_samples`` instances of each picked class.

    An epoch consists of ``num_batches`` batches, each of size
    ``num_classes * num_samples``.

    One purpose of this sampler is forming triplets and pos/neg pairs inside
    the batch. To guarantee that such pairs exist, ``num_classes`` and
    ``num_samples`` should be > 1. (1)

    This sampling scheme appears in the Person Re-Id paper
    `In Defense of the Triplet Loss for Person Re-Identification`_,
    with P (``num_classes``) = 32 and K (``num_samples``) = 4.

    Args:
        labels: list of class labels for each elem in the dataset
        num_classes: number of classes in a batch, should be > 1
        num_samples: number of instances of each class in a batch, should be > 1
        num_batches: number of batches in epoch
            (default = len(labels) // (num_classes * num_samples))

    .. _In Defense of the Triplet Loss for Person Re-Identification:
        https://arxiv.org/abs/1703.07737

    Example:

    .. code-block:: python

        from torch.utils.data import DataLoader

        train_labels = train_data.targets.cpu().numpy().tolist()
        train_sampler = BatchBalanceClassSampler(
            train_labels, num_classes=10, num_samples=4)
        loader = DataLoader(train_data, batch_sampler=train_sampler)
    """

    def __init__(
        self,
        labels: Union[List[int], np.ndarray],
        num_classes: int,
        num_samples: int,
        num_batches: int = None,
    ):
        """Sampler initialisation."""
        super().__init__(labels)

        class_counts = Counter(labels)
        assert isinstance(num_classes, int) and isinstance(num_samples, int)
        assert (1 < num_classes <= len(class_counts)) and (1 < num_samples)
        assert all(
            n > 1 for n in class_counts.values()
        ), "Each class shoud contain at least 2 instances to fit (1)"

        labels = np.array(labels)
        positions = np.arange(len(labels))

        self._labels = list(set(labels.tolist()))
        self._num_classes = num_classes
        self._num_samples = num_samples
        self._batch_size = self._num_classes * self._num_samples
        self._num_batches = num_batches or len(labels) // self._batch_size
        # Dataset positions of every class, keyed by label.
        self.lbl2idx = {
            label: positions[labels == label].tolist() for label in set(labels)
        }

    @property
    def batch_size(self) -> int:
        """
        Returns:
            this value should be used in DataLoader as batch size
        """
        return self._batch_size

    @property
    def batches_in_epoch(self) -> int:
        """
        Returns:
            number of batches in an epoch
        """
        return self._num_batches

    def __len__(self) -> int:
        """
        Returns:
            number of batches in an epoch
        """
        return self._num_batches

    def __iter__(self) -> Iterator[int]:
        """
        Returns:
            iterator over batches, each a list of dataset indices
        """
        batches = []
        for _ in range(self._num_batches):
            # ``_labels`` holds unique values and random.sample draws without
            # replacement, so the chosen classes are always distinct.
            chosen = random.sample(self._labels, self._num_classes)
            batch = []
            for cls_id in chosen:
                pool = self.lbl2idx[cls_id]
                with_replacement = self._num_samples > len(pool)
                drawn = np.random.choice(
                    pool, self._num_samples, replace=with_replacement
                )
                batch.extend(drawn.tolist())
            batches.append(batch)
        return iter(batches)
class DynamicBalanceClassSampler(Sampler):
    """
    Sampler for classification tasks with significant class imbalance.

    Sampling starts from the original class distribution and gradually moves
    towards a uniform one, as with downsampling.

    Let :math:`D_i = \\#C_i / \\#C_{min}`, where :math:`\\#C_i` is the size of
    class i and :math:`\\#C_{min}` is the size of the rarest class, so that
    :math:`D_i` describes the class distribution. With an exponential
    scheduler :math:`g(n_{epoch})`, the ratio used in a given epoch is
    :math:`D_i^{g(n_{epoch})}`, and data is sampled according to it.

    Notes:
         At the end of training, epochs contain only
         ``min_size_class * n_classes`` examples, so validating on every
         epoch may be unnecessary; consider ControlFlowCallback.

    Examples:
        >>> labels = np.random.randint(0, 4, size=(200,))
        >>> sampler = DynamicBalanceClassSampler(labels)
        >>> indices = list(sampler)

    Sampler was inspired by https://arxiv.org/abs/1901.06783
    """

    def __init__(
        self,
        labels: List[Union[int, str]],
        exp_lambda: float = 0.9,
        start_epoch: int = 0,
        max_d: Optional[int] = None,
        mode: Union[str, int] = "downsampling",
        ignore_warning: bool = False,
    ):
        """
        Args:
            labels: list of labels for each elem in the dataset
            exp_lambda: base of the exponential schedule, in (0, 1)
            start_epoch: start epoch number, can be useful for multi-stage
                experiments
            max_d: if not None, upper limit applied to every class ratio
                (heuristic)
            mode: number of samples per class at the end of training. Must be
                "downsampling" or a number. Make sure you understand how it
                works before changing it
            ignore_warning: ignore warning about min class size
        """
        assert isinstance(start_epoch, int)
        assert 0 < exp_lambda < 1, "exp_lambda must be in (0, 1)"
        super().__init__(labels)

        self.exp_lambda = exp_lambda
        self.max_d = np.inf if max_d is None else max_d
        self.epoch = start_epoch

        labels = np.array(labels)
        class_sizes = Counter(labels)
        self.min_class_size = min(class_sizes.values())

        if self.min_class_size < 100 and not ignore_warning:
            final_epoch_size = self.min_class_size * len(class_sizes)
            LOGGER.warning(
                f"the smallest class contains only {self.min_class_size}"
                f" examples. At the end of training, epochs will contain"
                f" only {final_epoch_size} examples"
            )

        # Size of each class relative to the rarest one.
        self.original_d = {
            label: size / self.min_class_size for label, size in class_sizes.items()
        }
        positions = np.arange(len(labels))
        self.label2idxes = {
            label: positions[labels == label].tolist() for label in set(labels)
        }

        if isinstance(mode, int):
            # A numeric mode replaces the per-class base size.
            self.min_class_size = mode
        else:
            assert mode == "downsampling"

        self.labels = labels
        self._update()

    def _update(self) -> None:
        """Recompute per-class sample counts, then advance the epoch counter."""
        exponent = self._exp_scheduler()
        per_class = {}
        for label, ratio in self.original_d.items():
            scaled = min(ratio**exponent, self.max_d)
            per_class[label] = int(scaled * self.min_class_size)

        self.samples_per_classes = per_class
        self.length = np.sum(list(per_class.values()))
        self.epoch += 1

    def _exp_scheduler(self) -> float:
        """Return ``exp_lambda`` raised to the current epoch."""
        return self.exp_lambda**self.epoch

    def __iter__(self) -> Iterator[int]:
        """
        Returns:
            iterator of indices of stratified sample
        """
        picked = []
        for label in sorted(self.label2idxes):
            pool = self.label2idxes[label]
            wanted = self.samples_per_classes[label]
            drawn = np.random.choice(pool, wanted, replace=wanted > len(pool))
            picked += drawn.tolist()
        assert len(picked) == self.length
        np.random.shuffle(picked)
        # Prepare the counts for the next epoch before handing out indices.
        self._update()
        return iter(picked)

    def __len__(self) -> int:
        """
        Returns:
             length of result sample
        """
        return self.length
class MiniEpochSampler(Sampler):
    """
    Sampler that walks through the dataset ``mini_epoch_len`` indices at a
    time; every ``__iter__`` call yields the next mini epoch.

    Args:
        data_len: Size of the dataset
        mini_epoch_len: Num samples from the dataset used in one
          mini epoch.
        drop_last: If ``True``, the trailing mini epoch is skipped when it
          would contain fewer than ``mini_epoch_len`` samples
        shuffle: one of ``"per_mini_epoch"``, ``"per_epoch"``, or ``None``.
          The sampler will shuffle indices
          > "per_mini_epoch" - every mini epoch (every ``__iter__`` call)
          > "per_epoch" -- every real epoch
          > None -- don't shuffle

    Example:
        >>> MiniEpochSampler(len(dataset), mini_epoch_len=100)
        >>> MiniEpochSampler(len(dataset), mini_epoch_len=100, drop_last=True)
        >>> MiniEpochSampler(len(dataset), mini_epoch_len=100,
        >>>     shuffle="per_epoch")
    """

    def __init__(
        self,
        data_len: int,
        mini_epoch_len: int,
        drop_last: bool = False,
        shuffle: str = None,
    ):
        """Sampler initialisation."""
        super().__init__(None)

        self.data_len = int(data_len)
        self.mini_epoch_len = int(mini_epoch_len)

        # Number of complete mini epochs that fit into the dataset.
        self.steps = int(data_len / self.mini_epoch_len)
        self.state_i = 0

        has_reminder = data_len - self.steps * mini_epoch_len > 0
        # ``divider`` is the number of mini epochs per pass over the data.
        if self.steps == 0:
            self.divider = 1
        else:
            keep_partial = has_reminder and not drop_last
            self.divider = self.steps + 1 if keep_partial else self.steps

        self._indices = np.arange(self.data_len)
        self.indices = self._indices
        self.end_pointer = max(self.data_len, self.mini_epoch_len)

        allowed = ["per_mini_epoch", "per_epoch"]
        if shuffle is not None and shuffle not in allowed:
            raise ValueError(
                "Shuffle must be one of ['per_mini_epoch', 'per_epoch']. "
                + f"Got {shuffle}"
            )
        self.shuffle_type = shuffle

    def shuffle(self) -> None:
        """Shuffle sampler indices."""
        every_call = self.shuffle_type == "per_mini_epoch"
        epoch_start = self.shuffle_type == "per_epoch" and self.state_i == 0
        if not (every_call or epoch_start):
            return

        if self.data_len >= self.mini_epoch_len:
            self.indices = self._indices
            np.random.shuffle(self.indices)
        else:
            # Dataset smaller than one mini epoch: oversample with replacement.
            self.indices = np.random.choice(
                self._indices, self.mini_epoch_len, replace=True
            )

    def __iter__(self) -> Iterator[int]:
        """Iterate over sampler.

        Returns:
            python iterator
        """
        self.state_i = self.state_i % self.divider
        self.shuffle()

        first = self.state_i * self.mini_epoch_len
        if self.state_i == self.steps:
            # Trailing (partial) mini epoch runs to the end of the indices.
            last = self.end_pointer
        else:
            last = (self.state_i + 1) * self.mini_epoch_len

        chunk = self.indices[first:last].tolist()
        self.state_i += 1
        return iter(chunk)

    def __len__(self) -> int:
        """
        Returns:
            int: length of the mini-epoch
        """
        return self.mini_epoch_len
class DistributedSamplerWrapper(DistributedSampler):
    """
    Wrapper over `Sampler` for distributed training.
    Allows you to use any sampler in distributed mode.

    It is especially useful in conjunction with
    `torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSamplerWrapper instance as a DataLoader
    sampler, and load a subset of subsampled data of the original dataset
    that is exclusive to it.

    .. note::
        Sampler is assumed to be of constant size.
    """

    def __init__(
        self,
        sampler,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
    ):
        """
        Args:
            sampler: Sampler used for subsampling
            num_replicas (int, optional): Number of processes participating in
                distributed training
            rank (int, optional): Rank of the current process
                within ``num_replicas``
            shuffle (bool, optional): If true (default),
                sampler will shuffle the indices
        """
        super().__init__(
            DatasetFromSampler(sampler),
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
        )
        self.sampler = sampler

    def __iter__(self) -> Iterator[int]:
        """Iterate over sampler.

        Returns:
            python iterator over this replica's share of the wrapped
            sampler's output
        """
        # Re-wrap the sampler so its output is drawn afresh on this pass.
        self.dataset = DatasetFromSampler(self.sampler)
        indexes_of_indexes = super().__iter__()
        subsampler_indexes = self.dataset
        # Plain per-index lookup instead of operator.itemgetter(*idx):
        # itemgetter returns a bare element (not a tuple) for exactly one
        # index and raises for zero indices, which broke ``iter(...)`` when a
        # replica received fewer than two items.
        return iter([subsampler_indexes[i] for i in indexes_of_indexes])
+__all__ = [
+    "BalanceClassSampler",
+    "BatchBalanceClassSampler",
+    "DistributedSamplerWrapper",
+    "DynamicBalanceClassSampler",
+    "MiniEpochSampler",
+]

utilities/tools.py ADDED Viewed

	@@ -0,0 +1,541 @@

+# Author: Haohe Liu
+# Email: [email protected]
+# Date: 11 Feb 2023
+import os
+import json
+import torch
+import torch.nn.functional as F
+import numpy as np
+import matplotlib
+from scipy.io import wavfile
+from matplotlib import pyplot as plt
+matplotlib.use("Agg")
+import hashlib
+import os
+import requests
+from tqdm import tqdm
# Public bucket that hosts every downloadable asset listed below.
_SPECVQGAN_BASE_URL = (
    "https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a/specvqgan_public/"
)

# Asset name -> file name used for the local copy.
CKPT_MAP = dict(
    vggishish_lpaps="vggishish16.pt",
    vggishish_mean_std_melspec_10s_22050hz="train_means_stds_melspec_10s_22050hz.txt",
    melception="melception-21-05-10T09-28-40.pt",
)

# Asset name -> download URL; the remote file name equals the local one.
URL_MAP = {
    asset: _SPECVQGAN_BASE_URL + filename for asset, filename in CKPT_MAP.items()
}

# Asset name -> expected MD5 hex digest of the downloaded file.
MD5_MAP = dict(
    vggishish_lpaps="197040c524a07ccacf7715d7080a80bd",
    vggishish_mean_std_melspec_10s_22050hz="f449c6fd0e248936c16f6d22492bb625",
    melception="a71a41041e945b457c7d3d814bbcf72d",
)

# Prefer the GPU when CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_json(fname):
    """Parse the JSON file at ``fname`` and return the decoded object."""
    with open(fname, "r") as handle:
        parsed = json.load(handle)
    return parsed
def read_json(dataset_json_file):
    """Return the ``"data"`` entry of the JSON document stored in the file.

    Raises:
        KeyError: if the top-level object has no ``"data"`` key.
    """
    with open(dataset_json_file, "r") as handle:
        document = json.load(handle)
    payload = document["data"]
    return payload
def copy_test_subset_data(metadata, testset_copy_target_path):
    """Mirror the ``"wav"`` files listed in ``metadata`` into a target folder.

    The folder is created when missing. If it already holds exactly
    ``len(metadata)`` entries it is assumed to be up to date and nothing is
    done. Otherwise its files are removed and every listed file is copied in.

    Args:
        metadata: sequence of dicts, each with a ``"wav"`` path.
        testset_copy_target_path: destination directory.
    """
    # Local import: the module header does not import ``shutil``.
    import shutil

    os.makedirs(testset_copy_target_path, exist_ok=True)
    existing = os.listdir(testset_copy_target_path)
    if len(existing) == len(metadata):
        return

    # Stale content: clear the folder (best effort) before copying again.
    for file in existing:
        try:
            os.remove(os.path.join(testset_copy_target_path, file))
        except OSError as e:
            print(e)

    print("Copying test subset data to {}".format(testset_copy_target_path))
    for each in tqdm(metadata):
        # shutil.copy replaces the former shell ``cp`` string, which broke on
        # paths containing spaces or shell metacharacters and required a
        # POSIX shell. Failures are reported and skipped, as ``cp`` did.
        try:
            shutil.copy(each["wav"], testset_copy_target_path)
        except OSError as e:
            print(e)
def listdir_nohidden(path):
    """Lazily yield the entries of ``path`` whose names do not start with a dot."""
    yield from filter(lambda entry: not entry.startswith("."), os.listdir(path))
def get_restore_step(path):
    """Pick the checkpoint file in ``path`` to resume from.

    Selection order:

    1. ``final.ckpt`` if present -> ``("final.ckpt", 0)``.
    2. Without ``last.ckpt``: the ``*step=<N>*.ckpt`` file with the largest
       ``N`` -> ``(file name, N)``.
    3. With ``last.ckpt``: the ``last-v<K>.ckpt`` file with the largest ``K``
       if any exists, otherwise ``last.ckpt`` -> ``(file name, 0)``.

    Args:
        path: directory containing checkpoint files.

    Returns:
        tuple: ``(checkpoint file name, step)``.

    Raises:
        ValueError: if neither ``final.ckpt`` nor ``last.ckpt`` exists and no
            file name carries a ``step=<N>`` marker.
    """
    # Local import: the module header does not import ``re``.
    import re

    checkpoints = os.listdir(path)

    if os.path.exists(os.path.join(path, "final.ckpt")):
        return "final.ckpt", 0

    if not os.path.exists(os.path.join(path, "last.ckpt")):
        # Only consider files that actually carry a step marker, so unrelated
        # files in the folder no longer crash the parsing.
        names, steps = [], []
        for name in checkpoints:
            found = re.search(r"step=(\d+)", name)
            if found and name.endswith(".ckpt"):
                names.append(name)
                steps.append(int(found.group(1)))
        if not steps:
            raise ValueError("No step checkpoint found in %s" % path)
        return names[np.argmax(steps)], np.max(steps)

    # The version used to be appended before being compared against the
    # maximum, so a versioned file could never win. Collect all versions
    # first, then pick the highest, independent of directory listing order.
    versions = []
    for name in checkpoints:
        found = re.fullmatch(r"last-v(\d+)\.ckpt", name)
        if found:
            versions.append(int(found.group(1)))
    if versions:
        return "last-v%s.ckpt" % max(versions), 0
    return "last.ckpt", 0
def download(url, local_path, chunk_size=1024):
    """Stream ``url`` into ``local_path`` while showing a progress bar.

    Args:
        url: address to fetch.
        local_path: destination file; its parent directory is created when
            the path has one.
        chunk_size: number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never written out as if it were the payload.
    """
    target_dir = os.path.split(local_path)[0]
    if target_dir:
        # A bare file name has no directory part; makedirs("") would raise.
        os.makedirs(target_dir, exist_ok=True)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar, open(
            local_path, "wb"
        ) as f:
            for data in r.iter_content(chunk_size=chunk_size):
                if data:
                    f.write(data)
                    # Advance by what was actually written; the final chunk
                    # is usually shorter than chunk_size.
                    pbar.update(len(data))
def md5_hash(path):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is hashed in fixed-size chunks so large checkpoints are never
    held in memory in full.
    """
    digest = hashlib.md5()
    with open(path, "rb") as f:
        # iter(callable, sentinel) stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()
def get_ckpt_path(name, root, check=False):
    """Return the local path of asset ``name``, downloading it when needed.

    A download happens when the file is missing, or when ``check`` is true
    and the existing file's MD5 digest differs from the expected one. A
    freshly downloaded file is always verified against ``MD5_MAP``.

    Args:
        name: key of ``URL_MAP`` / ``CKPT_MAP`` / ``MD5_MAP``.
        root: directory that holds (or will hold) the file.
        check: also verify the digest of an already existing file.

    Returns:
        str: path of the local file.
    """
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])

    needs_download = not os.path.exists(path)
    if not needs_download and check:
        needs_download = md5_hash(path) != MD5_MAP[name]

    if needs_download:
        source = URL_MAP[name]
        print("Downloading {} model from {} to {}".format(name, source, path))
        download(source, path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path
class KeyNotFoundError(Exception):
    """Lookup failure that records the underlying cause and the path walked.

    Attributes are ``cause`` (the original exception), ``keys`` (the full key
    path requested) and ``visited`` (the keys resolved before the failure).
    """

    def __init__(self, cause, keys=None, visited=None):
        self.cause = cause
        self.keys = keys
        self.visited = visited

        # Optional lines first, the cause is always reported last.
        parts = []
        if keys is not None:
            parts.append(f"Key not found: {keys}")
        if visited is not None:
            parts.append(f"Visited: {visited}")
        parts.append(f"Cause:\n{cause}")

        super().__init__("\n".join(parts))
def retrieve(
    list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
):
    """Given a nested list or dict return the desired value at key expanding
    callable nodes if necessary and :attr:`expand` is ``True``. The expansion
    is done in-place.

    Parameters
    ----------
        list_or_dict : list or dict
            Possibly nested list or dictionary. A callable root is expanded
            too, but its result cannot be stored back anywhere.
        key : str
            key/to/value, path like string describing all keys necessary to
            consider to get to the desired value. List indices can also be
            passed here.
        splitval : str
            String that defines the delimiter between keys of the
            different depth levels in `key`.
        default : obj
            Value returned if :attr:`key` is not found.
        expand : bool
            Whether to expand callable nodes on the path or not.
        pass_success : bool
            If ``True``, return a ``(value, success)`` tuple instead of the
            bare value.

    Returns
    -------
        The desired value or if :attr:`default` is not ``None`` and the
        :attr:`key` is not found returns ``default``.

    Raises
    ------
        KeyNotFoundError if ``key`` not in ``list_or_dict`` and
        :attr:`default` is ``None``.
    """
    keys = key.split(splitval)

    success = True
    try:
        visited = []
        parent = None
        last_key = None
        # A dedicated loop variable keeps the ``key`` argument intact.
        for step in keys:
            if callable(list_or_dict):
                if not expand:
                    raise KeyNotFoundError(
                        ValueError(
                            "Trying to get past callable node with expand=False."
                        ),
                        keys=keys,
                        visited=visited,
                    )
                list_or_dict = list_or_dict()
                # The root has no parent container to write back into; this
                # used to fail with a TypeError on ``None[None]``.
                if parent is not None:
                    parent[last_key] = list_or_dict

            last_key = step
            parent = list_or_dict

            try:
                if isinstance(list_or_dict, dict):
                    list_or_dict = list_or_dict[step]
                else:
                    list_or_dict = list_or_dict[int(step)]
            except (KeyError, IndexError, ValueError) as e:
                raise KeyNotFoundError(e, keys=keys, visited=visited)

            visited.append(step)

        # final expansion of retrieved value
        if expand and callable(list_or_dict):
            list_or_dict = list_or_dict()
            parent[last_key] = list_or_dict
    except KeyNotFoundError as e:
        if default is None:
            raise e
        else:
            list_or_dict = default
            success = False

    if not pass_success:
        return list_or_dict
    else:
        return list_or_dict, success
def to_device(data, device):
    """Convert the array fields of a batch to tensors on ``device``.

    Two batch layouts are recognised, by length:

    * 6 fields: ``(ids, raw_texts, speakers, texts, src_lens, max_src_len)``
    * 12 fields: the six above followed by
      ``(mels, mel_lens, max_mel_len, pitches, energies, durations)``

    ``speakers``, ``texts`` and ``durations`` are cast to ``long``; ``mels``
    and ``pitches`` to ``float``; ``src_lens``, ``mel_lens`` and ``energies``
    keep their dtype. The remaining fields are passed through untouched.
    The converted fields are handed to ``torch.from_numpy``.

    Returns a tuple with the same layout as the input, or ``None`` when
    ``data`` has any other length.
    """
    size = len(data)
    if size != 12 and size != 6:
        # Unrecognised layout: the function yields None.
        return None

    fields = list(data)

    # Positions 2-4 (speakers, texts, src_lens) are shared by both layouts.
    fields[2] = torch.from_numpy(fields[2]).long().to(device)
    fields[3] = torch.from_numpy(fields[3]).long().to(device)
    fields[4] = torch.from_numpy(fields[4]).to(device)

    if size == 12:
        fields[6] = torch.from_numpy(fields[6]).float().to(device)  # mels
        fields[7] = torch.from_numpy(fields[7]).to(device)  # mel_lens
        fields[9] = torch.from_numpy(fields[9]).float().to(device)  # pitches
        fields[10] = torch.from_numpy(fields[10]).to(device)  # energies
        fields[11] = torch.from_numpy(fields[11]).long().to(device)  # durations

    return tuple(fields)
def log(logger, step=None, fig=None, audio=None, sampling_rate=22050, tag=""):
    """Send a figure and/or an audio clip to ``logger`` under ``tag``.

    Parameters
    ----------
        logger : object
            Must provide ``add_figure`` and ``add_audio``; the keyword names
            used here follow the TensorBoard ``SummaryWriter`` interface
            (assumed -- confirm if a different logger is passed).
        step : int, optional
            Step forwarded to the logger as ``global_step``.
        fig : object, optional
            Figure to record; skipped when ``None``.
        audio : array-like, optional
            1-D waveform; skipped when ``None``. It is peak-normalised to
            ``1 / 1.1`` before being recorded.
        sampling_rate : int
            Sample rate reported to the logger for ``audio``.
        tag : str
            Name under which both items are recorded.
    """
    if fig is not None:
        logger.add_figure(tag, fig, global_step=step)
    if audio is not None:
        peak = max(abs(audio))
        # Leave silent clips untouched instead of dividing by zero, which
        # would fill the clip with NaN/inf samples.
        if peak > 0:
            audio = audio / (peak * 1.1)
        logger.add_audio(
            tag,
            audio,
            global_step=step,
            sample_rate=sampling_rate,
        )
def get_mask_from_lengths(lengths, max_len=None):
    """Build a boolean padding mask from per-item sequence lengths.

    Parameters
    ----------
        lengths : torch.Tensor
            1-D tensor holding one length per batch item.
        max_len : int, optional
            Width of the mask; defaults to the largest value in ``lengths``.

    Returns
    -------
        Boolean tensor of shape ``(batch, max_len)`` on the same device as
        ``lengths``; ``True`` marks positions at or beyond an item's length.
    """
    batch_size = lengths.shape[0]
    if max_len is None:
        max_len = torch.max(lengths).item()
    # Create the position indices on the device of ``lengths`` so the
    # comparison below never mixes devices and no module-level ``device``
    # global is required.
    ids = (
        torch.arange(0, max_len, device=lengths.device)
        .unsqueeze(0)
        .expand(batch_size, -1)
    )
    mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
    return mask
def expand(values, durations):
    """Repeat each value by its paired duration and return a NumPy array.

    ``values`` and ``durations`` are walked in lockstep; each duration is
    truncated with ``int`` and non-positive counts contribute nothing.
    """
    repeated = []
    for item, count in zip(values, durations):
        reps = int(count)
        if reps > 0:
            repeated.extend([item] * reps)
    return np.array(repeated)
def synth_one_sample_val(
    targets, predictions, vocoder, model_config, preprocess_config
):
    """Plot and optionally vocode one randomly chosen validation sample.

    A random batch index is drawn, the target mel (``targets[6]``), the raw
    prediction (``predictions[0]``) and the postnet prediction
    (``predictions[1]``) are cropped to ``predictions[9][index]`` frames and
    plotted with ``plot_mel``. When ``vocoder`` is given, the target and the
    postnet prediction are additionally passed through ``vocoder_infer``.

    The layout of ``targets`` / ``predictions`` is only known from the
    indices used here -- confirm against the caller.

    Returns
    -------
        ``(fig, wav_reconstruction, wav_prediction, basename)``; both
        waveforms are ``None`` when ``vocoder`` is ``None``.
    """
    index = np.random.choice(list(np.arange(targets[6].size(0))))
    basename = targets[0][index]
    mel_len = predictions[9][index].item()
    mel_target = targets[6][index, :mel_len].detach().transpose(0, 1)
    mel_prediction = predictions[0][index, :mel_len].detach().transpose(0, 1)
    postnet_mel_prediction = predictions[1][index, :mel_len].detach().transpose(0, 1)
    # ``plot_mel`` in this module takes ``(data, titles)`` and draws each
    # entry of ``data`` directly as an image, so it receives bare mel arrays.
    # The previous call passed (mel, pitch, energy) tuples plus a ``stats``
    # argument, which that signature rejects.
    fig = plot_mel(
        [
            mel_prediction.cpu().numpy(),
            postnet_mel_prediction.cpu().numpy(),
            mel_target.cpu().numpy(),
        ],
        [
            "Raw mel spectrogram prediction",
            "Postnet mel prediction",
            "Ground-Truth Spectrogram",
        ],
    )
    if vocoder is not None:
        from .model import vocoder_infer

        # NOTE(review): synth_one_sample calls vocoder_infer with only
        # (mel, vocoder); confirm that it accepts the two config arguments.
        wav_reconstruction = vocoder_infer(
            mel_target.unsqueeze(0),
            vocoder,
            model_config,
            preprocess_config,
        )[0]
        wav_prediction = vocoder_infer(
            postnet_mel_prediction.unsqueeze(0),
            vocoder,
            model_config,
            preprocess_config,
        )[0]
    else:
        wav_reconstruction = wav_prediction = None
    return fig, wav_reconstruction, wav_prediction, basename
def synth_one_sample(mel_input, mel_prediction, labels, vocoder):
    """Vocode an input mel and a predicted mel.

    Both tensors have their last two axes swapped with ``permute(0, 2, 1)``
    before being handed to ``vocoder_infer``. ``labels`` is accepted but not
    used.

    Returns
    -------
        ``(wav_reconstruction, wav_prediction)``, or ``(None, None)`` when
        ``vocoder`` is ``None``.
    """
    if vocoder is None:
        return None, None

    # Imported lazily, only when a vocoder is actually supplied.
    from .model import vocoder_infer

    reconstruction = vocoder_infer(mel_input.permute(0, 2, 1), vocoder)
    prediction = vocoder_infer(mel_prediction.permute(0, 2, 1), vocoder)
    return reconstruction, prediction
def synth_samples(targets, predictions, vocoder, model_config, preprocess_config, path):
    """Write a spectrogram image and a waveform file for every batch item.

    For each item the postnet mel (``predictions[1]``) is cropped to
    ``predictions[9][i]`` frames, plotted with ``plot_mel`` and saved as
    ``<basename>_postnet_2.png`` in ``path``. Afterwards the whole batch is
    vocoded with ``vocoder_infer`` and each waveform is written as
    ``<basename>.wav`` using the sampling rate from ``preprocess_config``.

    The layout of ``targets`` / ``predictions`` is only known from the
    indices used here -- confirm against the caller.
    """
    basenames = targets[0]
    for i in range(len(predictions[1])):
        basename = basenames[i]
        mel_len = predictions[9][i].item()
        mel_prediction = predictions[1][i, :mel_len].detach().transpose(0, 1)
        # ``plot_mel`` in this module takes ``(data, titles)`` and draws each
        # entry directly, so it receives the bare mel array. The previous call
        # passed a (mel, pitch, energy) tuple plus ``stats``, which that
        # signature rejects.
        fig = plot_mel(
            [mel_prediction.cpu().numpy()],
            ["Synthetized Spectrogram by PostNet"],
        )
        # Save and close this specific figure rather than relying on
        # pyplot's implicit "current figure".
        fig.savefig(os.path.join(path, "{}_postnet_2.png".format(basename)))
        plt.close(fig)
    from .model import vocoder_infer

    mel_predictions = predictions[1].transpose(1, 2)
    lengths = predictions[9] * preprocess_config["preprocessing"]["stft"]["hop_length"]
    # NOTE(review): synth_one_sample calls vocoder_infer with only
    # (mel, vocoder); confirm that it accepts the two config arguments.
    wav_predictions = vocoder_infer(
        mel_predictions, vocoder, model_config, preprocess_config, lengths=lengths
    )
    sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
    for wav, basename in zip(wav_predictions, basenames):
        wavfile.write(os.path.join(path, "{}.wav".format(basename)), sampling_rate, wav)
def plot_mel(data, titles=None):
    """Draw each spectrogram in ``data`` on its own row of a single figure.

    Parameters
    ----------
        data : sequence
            Arrays to display, one per row; each is passed to ``imshow`` and
            its first dimension sets the y-axis limit.
        titles : sequence, optional
            One title per entry of ``data``; rows are untitled when omitted.

    Returns
    -------
        The figure created by ``plt.subplots``.
    """
    n_rows = len(data)
    fig, axes = plt.subplots(n_rows, 1, squeeze=False)
    if titles is None:
        titles = [None] * n_rows

    for row, mel in enumerate(data):
        # squeeze=False keeps ``axes`` two-dimensional; there is one column.
        panel = axes[row][0]
        panel.imshow(mel, origin="lower", aspect="auto")
        panel.set_aspect(2.5, adjustable="box")
        panel.set_ylim(0, mel.shape[0])
        panel.set_title(titles[row], fontsize="medium")
        panel.tick_params(labelsize="x-small", left=False, labelleft=False)
        panel.set_anchor("W")
    return fig
def pad_1D(inputs, PAD=0):
    """Right-pad 1-D arrays to a common length and stack them.

    Every array in ``inputs`` is extended with ``PAD`` up to the length of
    the longest one; the result is a single stacked array with one row per
    input.
    """
    target_len = max(len(seq) for seq in inputs)
    rows = []
    for seq in inputs:
        missing = target_len - seq.shape[0]
        rows.append(
            np.pad(seq, (0, missing), mode="constant", constant_values=PAD)
        )
    return np.stack(rows)
def pad_2D(inputs, maxlen=None):
    """Zero-pad 2-D arrays along their first axis and stack them.

    Each array is extended with zero rows up to ``maxlen`` when it is given
    (and truthy), otherwise up to the largest first dimension found in
    ``inputs``. The second axis keeps its original width.

    Raises
    ------
        ValueError if an array has more rows than the target length.
    """

    def _grow_rows(matrix, n_rows):
        current = np.shape(matrix)[0]
        if current > n_rows:
            raise ValueError("not max_len")
        width = np.shape(matrix)[1]
        # np.pad with a single (before, after) pair pads every axis, so the
        # extra columns are trimmed off again right after.
        grown = np.pad(
            matrix, (0, n_rows - current), mode="constant", constant_values=0
        )
        return grown[:, :width]

    target = maxlen if maxlen else max(np.shape(m)[0] for m in inputs)
    return np.stack([_grow_rows(m, target) for m in inputs])
def pad(input_ele, mel_max_length=None):
    """Zero-pad a sequence of tensors along dim 0 and stack them.

    Parameters
    ----------
        input_ele : sequence of torch.Tensor
            1-D or 2-D tensors to be padded.
        mel_max_length : int, optional
            Target length of dim 0; defaults to the largest ``size(0)`` found
            in ``input_ele``.

    Returns
    -------
        A single tensor made by stacking the padded elements.

    Raises
    ------
        ValueError if an element is neither 1-D nor 2-D.
    """
    if mel_max_length:
        max_len = mel_max_length
    else:
        max_len = max(element.size(0) for element in input_ele)
    out_list = []
    for batch in input_ele:
        n_dims = len(batch.shape)
        if n_dims == 1:
            one_batch_padded = F.pad(
                batch, (0, max_len - batch.size(0)), "constant", 0.0
            )
        elif n_dims == 2:
            # F.pad lists pads from the last dim backwards: leave the last
            # dim alone and extend dim 0 at its end.
            one_batch_padded = F.pad(
                batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0
            )
        else:
            # Previously such an element silently reused the padded result
            # of the preceding one (or hit an unbound local on the first).
            raise ValueError(
                "pad() supports only 1-D or 2-D tensors, got {} dims".format(n_dims)
            )
        out_list.append(one_batch_padded)
    out_padded = torch.stack(out_list)
    return out_padded