Mahiruoshi commited on
Commit
42f7394
·
1 Parent(s): 805bf42

Delete onnx_modules

Browse files
onnx_modules/V200/__init__.py DELETED
File without changes
onnx_modules/V200/attentions_onnx.py DELETED
@@ -1,378 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import logging
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- class LayerNorm(nn.Module):
13
- def __init__(self, channels, eps=1e-5):
14
- super().__init__()
15
- self.channels = channels
16
- self.eps = eps
17
-
18
- self.gamma = nn.Parameter(torch.ones(channels))
19
- self.beta = nn.Parameter(torch.zeros(channels))
20
-
21
- def forward(self, x):
22
- x = x.transpose(1, -1)
23
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
- return x.transpose(1, -1)
25
-
26
-
27
- @torch.jit.script
28
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
- n_channels_int = n_channels[0]
30
- in_act = input_a + input_b
31
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
- acts = t_act * s_act
34
- return acts
35
-
36
-
37
- class Encoder(nn.Module):
38
- def __init__(
39
- self,
40
- hidden_channels,
41
- filter_channels,
42
- n_heads,
43
- n_layers,
44
- kernel_size=1,
45
- p_dropout=0.0,
46
- window_size=4,
47
- isflow=True,
48
- **kwargs
49
- ):
50
- super().__init__()
51
- self.hidden_channels = hidden_channels
52
- self.filter_channels = filter_channels
53
- self.n_heads = n_heads
54
- self.n_layers = n_layers
55
- self.kernel_size = kernel_size
56
- self.p_dropout = p_dropout
57
- self.window_size = window_size
58
- # if isflow:
59
- # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
- # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
- # self.cond_layer = weight_norm(cond_layer, name='weight')
62
- # self.gin_channels = 256
63
- self.cond_layer_idx = self.n_layers
64
- if "gin_channels" in kwargs:
65
- self.gin_channels = kwargs["gin_channels"]
66
- if self.gin_channels != 0:
67
- self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
- # vits2 says 3rd block, so idx is 2 by default
69
- self.cond_layer_idx = (
70
- kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
- )
72
- logging.debug(self.gin_channels, self.cond_layer_idx)
73
- assert (
74
- self.cond_layer_idx < self.n_layers
75
- ), "cond_layer_idx should be less than n_layers"
76
- self.drop = nn.Dropout(p_dropout)
77
- self.attn_layers = nn.ModuleList()
78
- self.norm_layers_1 = nn.ModuleList()
79
- self.ffn_layers = nn.ModuleList()
80
- self.norm_layers_2 = nn.ModuleList()
81
- for i in range(self.n_layers):
82
- self.attn_layers.append(
83
- MultiHeadAttention(
84
- hidden_channels,
85
- hidden_channels,
86
- n_heads,
87
- p_dropout=p_dropout,
88
- window_size=window_size,
89
- )
90
- )
91
- self.norm_layers_1.append(LayerNorm(hidden_channels))
92
- self.ffn_layers.append(
93
- FFN(
94
- hidden_channels,
95
- hidden_channels,
96
- filter_channels,
97
- kernel_size,
98
- p_dropout=p_dropout,
99
- )
100
- )
101
- self.norm_layers_2.append(LayerNorm(hidden_channels))
102
-
103
- def forward(self, x, x_mask, g=None):
104
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
105
- x = x * x_mask
106
- for i in range(self.n_layers):
107
- if i == self.cond_layer_idx and g is not None:
108
- g = self.spk_emb_linear(g.transpose(1, 2))
109
- g = g.transpose(1, 2)
110
- x = x + g
111
- x = x * x_mask
112
- y = self.attn_layers[i](x, x, attn_mask)
113
- y = self.drop(y)
114
- x = self.norm_layers_1[i](x + y)
115
-
116
- y = self.ffn_layers[i](x, x_mask)
117
- y = self.drop(y)
118
- x = self.norm_layers_2[i](x + y)
119
- x = x * x_mask
120
- return x
121
-
122
-
123
- class MultiHeadAttention(nn.Module):
124
- def __init__(
125
- self,
126
- channels,
127
- out_channels,
128
- n_heads,
129
- p_dropout=0.0,
130
- window_size=None,
131
- heads_share=True,
132
- block_length=None,
133
- proximal_bias=False,
134
- proximal_init=False,
135
- ):
136
- super().__init__()
137
- assert channels % n_heads == 0
138
-
139
- self.channels = channels
140
- self.out_channels = out_channels
141
- self.n_heads = n_heads
142
- self.p_dropout = p_dropout
143
- self.window_size = window_size
144
- self.heads_share = heads_share
145
- self.block_length = block_length
146
- self.proximal_bias = proximal_bias
147
- self.proximal_init = proximal_init
148
- self.attn = None
149
-
150
- self.k_channels = channels // n_heads
151
- self.conv_q = nn.Conv1d(channels, channels, 1)
152
- self.conv_k = nn.Conv1d(channels, channels, 1)
153
- self.conv_v = nn.Conv1d(channels, channels, 1)
154
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
155
- self.drop = nn.Dropout(p_dropout)
156
-
157
- if window_size is not None:
158
- n_heads_rel = 1 if heads_share else n_heads
159
- rel_stddev = self.k_channels**-0.5
160
- self.emb_rel_k = nn.Parameter(
161
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
162
- * rel_stddev
163
- )
164
- self.emb_rel_v = nn.Parameter(
165
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
166
- * rel_stddev
167
- )
168
-
169
- nn.init.xavier_uniform_(self.conv_q.weight)
170
- nn.init.xavier_uniform_(self.conv_k.weight)
171
- nn.init.xavier_uniform_(self.conv_v.weight)
172
- if proximal_init:
173
- with torch.no_grad():
174
- self.conv_k.weight.copy_(self.conv_q.weight)
175
- self.conv_k.bias.copy_(self.conv_q.bias)
176
-
177
- def forward(self, x, c, attn_mask=None):
178
- q = self.conv_q(x)
179
- k = self.conv_k(c)
180
- v = self.conv_v(c)
181
-
182
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
183
-
184
- x = self.conv_o(x)
185
- return x
186
-
187
- def attention(self, query, key, value, mask=None):
188
- # reshape [b, d, t] -> [b, n_h, t, d_k]
189
- b, d, t_s, t_t = (*key.size(), query.size(2))
190
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
191
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
192
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
193
-
194
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
195
- if self.window_size is not None:
196
- assert (
197
- t_s == t_t
198
- ), "Relative attention is only available for self-attention."
199
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
200
- rel_logits = self._matmul_with_relative_keys(
201
- query / math.sqrt(self.k_channels), key_relative_embeddings
202
- )
203
- scores_local = self._relative_position_to_absolute_position(rel_logits)
204
- scores = scores + scores_local
205
- if self.proximal_bias:
206
- assert t_s == t_t, "Proximal bias is only available for self-attention."
207
- scores = scores + self._attention_bias_proximal(t_s).to(
208
- device=scores.device, dtype=scores.dtype
209
- )
210
- if mask is not None:
211
- scores = scores.masked_fill(mask == 0, -1e4)
212
- if self.block_length is not None:
213
- assert (
214
- t_s == t_t
215
- ), "Local attention is only available for self-attention."
216
- block_mask = (
217
- torch.ones_like(scores)
218
- .triu(-self.block_length)
219
- .tril(self.block_length)
220
- )
221
- scores = scores.masked_fill(block_mask == 0, -1e4)
222
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
223
- p_attn = self.drop(p_attn)
224
- output = torch.matmul(p_attn, value)
225
- if self.window_size is not None:
226
- relative_weights = self._absolute_position_to_relative_position(p_attn)
227
- value_relative_embeddings = self._get_relative_embeddings(
228
- self.emb_rel_v, t_s
229
- )
230
- output = output + self._matmul_with_relative_values(
231
- relative_weights, value_relative_embeddings
232
- )
233
- output = (
234
- output.transpose(2, 3).contiguous().view(b, d, t_t)
235
- ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
236
- return output, p_attn
237
-
238
- def _matmul_with_relative_values(self, x, y):
239
- """
240
- x: [b, h, l, m]
241
- y: [h or 1, m, d]
242
- ret: [b, h, l, d]
243
- """
244
- ret = torch.matmul(x, y.unsqueeze(0))
245
- return ret
246
-
247
- def _matmul_with_relative_keys(self, x, y):
248
- """
249
- x: [b, h, l, d]
250
- y: [h or 1, m, d]
251
- ret: [b, h, l, m]
252
- """
253
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
254
- return ret
255
-
256
- def _get_relative_embeddings(self, relative_embeddings, length):
257
- max_relative_position = 2 * self.window_size + 1
258
- # Pad first before slice to avoid using cond ops.
259
- pad_length = max(length - (self.window_size + 1), 0)
260
- slice_start_position = max((self.window_size + 1) - length, 0)
261
- slice_end_position = slice_start_position + 2 * length - 1
262
- if pad_length > 0:
263
- padded_relative_embeddings = F.pad(
264
- relative_embeddings,
265
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
266
- )
267
- else:
268
- padded_relative_embeddings = relative_embeddings
269
- used_relative_embeddings = padded_relative_embeddings[
270
- :, slice_start_position:slice_end_position
271
- ]
272
- return used_relative_embeddings
273
-
274
- def _relative_position_to_absolute_position(self, x):
275
- """
276
- x: [b, h, l, 2*l-1]
277
- ret: [b, h, l, l]
278
- """
279
- batch, heads, length, _ = x.size()
280
- # Concat columns of pad to shift from relative to absolute indexing.
281
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
282
-
283
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
284
- x_flat = x.view([batch, heads, length * 2 * length])
285
- x_flat = F.pad(
286
- x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
287
- )
288
-
289
- # Reshape and slice out the padded elements.
290
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
291
- :, :, :length, length - 1 :
292
- ]
293
- return x_final
294
-
295
- def _absolute_position_to_relative_position(self, x):
296
- """
297
- x: [b, h, l, l]
298
- ret: [b, h, l, 2*l-1]
299
- """
300
- batch, heads, length, _ = x.size()
301
- # padd along column
302
- x = F.pad(
303
- x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
304
- )
305
- x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
306
- # add 0's in the beginning that will skew the elements after reshape
307
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
308
- x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
309
- return x_final
310
-
311
- def _attention_bias_proximal(self, length):
312
- """Bias for self-attention to encourage attention to close positions.
313
- Args:
314
- length: an integer scalar.
315
- Returns:
316
- a Tensor with shape [1, 1, length, length]
317
- """
318
- r = torch.arange(length, dtype=torch.float32)
319
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
320
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
321
-
322
-
323
- class FFN(nn.Module):
324
- def __init__(
325
- self,
326
- in_channels,
327
- out_channels,
328
- filter_channels,
329
- kernel_size,
330
- p_dropout=0.0,
331
- activation=None,
332
- causal=False,
333
- ):
334
- super().__init__()
335
- self.in_channels = in_channels
336
- self.out_channels = out_channels
337
- self.filter_channels = filter_channels
338
- self.kernel_size = kernel_size
339
- self.p_dropout = p_dropout
340
- self.activation = activation
341
- self.causal = causal
342
-
343
- if causal:
344
- self.padding = self._causal_padding
345
- else:
346
- self.padding = self._same_padding
347
-
348
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
349
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
350
- self.drop = nn.Dropout(p_dropout)
351
-
352
- def forward(self, x, x_mask):
353
- x = self.conv_1(self.padding(x * x_mask))
354
- if self.activation == "gelu":
355
- x = x * torch.sigmoid(1.702 * x)
356
- else:
357
- x = torch.relu(x)
358
- x = self.drop(x)
359
- x = self.conv_2(self.padding(x * x_mask))
360
- return x * x_mask
361
-
362
- def _causal_padding(self, x):
363
- if self.kernel_size == 1:
364
- return x
365
- pad_l = self.kernel_size - 1
366
- pad_r = 0
367
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
368
- x = F.pad(x, commons.convert_pad_shape(padding))
369
- return x
370
-
371
- def _same_padding(self, x):
372
- if self.kernel_size == 1:
373
- return x
374
- pad_l = (self.kernel_size - 1) // 2
375
- pad_r = self.kernel_size // 2
376
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
377
- x = F.pad(x, commons.convert_pad_shape(padding))
378
- return x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_modules/V200/models_onnx.py DELETED
@@ -1,990 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import modules
8
- from . import attentions_onnx
9
-
10
- from torch.nn import Conv1d, ConvTranspose1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from commons import init_weights, get_padding
13
- from .text import symbols, num_tones, num_languages
14
-
15
-
16
- class DurationDiscriminator(nn.Module): # vits2
17
- def __init__(
18
- self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
19
- ):
20
- super().__init__()
21
-
22
- self.in_channels = in_channels
23
- self.filter_channels = filter_channels
24
- self.kernel_size = kernel_size
25
- self.p_dropout = p_dropout
26
- self.gin_channels = gin_channels
27
-
28
- self.drop = nn.Dropout(p_dropout)
29
- self.conv_1 = nn.Conv1d(
30
- in_channels, filter_channels, kernel_size, padding=kernel_size // 2
31
- )
32
- self.norm_1 = modules.LayerNorm(filter_channels)
33
- self.conv_2 = nn.Conv1d(
34
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
35
- )
36
- self.norm_2 = modules.LayerNorm(filter_channels)
37
- self.dur_proj = nn.Conv1d(1, filter_channels, 1)
38
-
39
- self.pre_out_conv_1 = nn.Conv1d(
40
- 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
41
- )
42
- self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
43
- self.pre_out_conv_2 = nn.Conv1d(
44
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
45
- )
46
- self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
47
-
48
- if gin_channels != 0:
49
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
50
-
51
- self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
52
-
53
- def forward_probability(self, x, x_mask, dur, g=None):
54
- dur = self.dur_proj(dur)
55
- x = torch.cat([x, dur], dim=1)
56
- x = self.pre_out_conv_1(x * x_mask)
57
- x = torch.relu(x)
58
- x = self.pre_out_norm_1(x)
59
- x = self.drop(x)
60
- x = self.pre_out_conv_2(x * x_mask)
61
- x = torch.relu(x)
62
- x = self.pre_out_norm_2(x)
63
- x = self.drop(x)
64
- x = x * x_mask
65
- x = x.transpose(1, 2)
66
- output_prob = self.output_layer(x)
67
- return output_prob
68
-
69
- def forward(self, x, x_mask, dur_r, dur_hat, g=None):
70
- x = torch.detach(x)
71
- if g is not None:
72
- g = torch.detach(g)
73
- x = x + self.cond(g)
74
- x = self.conv_1(x * x_mask)
75
- x = torch.relu(x)
76
- x = self.norm_1(x)
77
- x = self.drop(x)
78
- x = self.conv_2(x * x_mask)
79
- x = torch.relu(x)
80
- x = self.norm_2(x)
81
- x = self.drop(x)
82
-
83
- output_probs = []
84
- for dur in [dur_r, dur_hat]:
85
- output_prob = self.forward_probability(x, x_mask, dur, g)
86
- output_probs.append(output_prob)
87
-
88
- return output_probs
89
-
90
-
91
- class TransformerCouplingBlock(nn.Module):
92
- def __init__(
93
- self,
94
- channels,
95
- hidden_channels,
96
- filter_channels,
97
- n_heads,
98
- n_layers,
99
- kernel_size,
100
- p_dropout,
101
- n_flows=4,
102
- gin_channels=0,
103
- share_parameter=False,
104
- ):
105
- super().__init__()
106
- self.channels = channels
107
- self.hidden_channels = hidden_channels
108
- self.kernel_size = kernel_size
109
- self.n_layers = n_layers
110
- self.n_flows = n_flows
111
- self.gin_channels = gin_channels
112
-
113
- self.flows = nn.ModuleList()
114
-
115
- self.wn = (
116
- attentions_onnx.FFT(
117
- hidden_channels,
118
- filter_channels,
119
- n_heads,
120
- n_layers,
121
- kernel_size,
122
- p_dropout,
123
- isflow=True,
124
- gin_channels=self.gin_channels,
125
- )
126
- if share_parameter
127
- else None
128
- )
129
-
130
- for i in range(n_flows):
131
- self.flows.append(
132
- modules.TransformerCouplingLayer(
133
- channels,
134
- hidden_channels,
135
- kernel_size,
136
- n_layers,
137
- n_heads,
138
- p_dropout,
139
- filter_channels,
140
- mean_only=True,
141
- wn_sharing_parameter=self.wn,
142
- gin_channels=self.gin_channels,
143
- )
144
- )
145
- self.flows.append(modules.Flip())
146
-
147
- def forward(self, x, x_mask, g=None, reverse=True):
148
- if not reverse:
149
- for flow in self.flows:
150
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
151
- else:
152
- for flow in reversed(self.flows):
153
- x = flow(x, x_mask, g=g, reverse=reverse)
154
- return x
155
-
156
-
157
- class StochasticDurationPredictor(nn.Module):
158
- def __init__(
159
- self,
160
- in_channels,
161
- filter_channels,
162
- kernel_size,
163
- p_dropout,
164
- n_flows=4,
165
- gin_channels=0,
166
- ):
167
- super().__init__()
168
- filter_channels = in_channels # it needs to be removed from future version.
169
- self.in_channels = in_channels
170
- self.filter_channels = filter_channels
171
- self.kernel_size = kernel_size
172
- self.p_dropout = p_dropout
173
- self.n_flows = n_flows
174
- self.gin_channels = gin_channels
175
-
176
- self.log_flow = modules.Log()
177
- self.flows = nn.ModuleList()
178
- self.flows.append(modules.ElementwiseAffine(2))
179
- for i in range(n_flows):
180
- self.flows.append(
181
- modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
182
- )
183
- self.flows.append(modules.Flip())
184
-
185
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
186
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
187
- self.post_convs = modules.DDSConv(
188
- filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
189
- )
190
- self.post_flows = nn.ModuleList()
191
- self.post_flows.append(modules.ElementwiseAffine(2))
192
- for i in range(4):
193
- self.post_flows.append(
194
- modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
195
- )
196
- self.post_flows.append(modules.Flip())
197
-
198
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
199
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
200
- self.convs = modules.DDSConv(
201
- filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
202
- )
203
- if gin_channels != 0:
204
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
205
-
206
- def forward(self, x, x_mask, z, g=None):
207
- x = torch.detach(x)
208
- x = self.pre(x)
209
- if g is not None:
210
- g = torch.detach(g)
211
- x = x + self.cond(g)
212
- x = self.convs(x, x_mask)
213
- x = self.proj(x) * x_mask
214
-
215
- flows = list(reversed(self.flows))
216
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
217
- for flow in flows:
218
- z = flow(z, x_mask, g=x, reverse=True)
219
- z0, z1 = torch.split(z, [1, 1], 1)
220
- logw = z0
221
- return logw
222
-
223
-
224
- class DurationPredictor(nn.Module):
225
- def __init__(
226
- self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
227
- ):
228
- super().__init__()
229
-
230
- self.in_channels = in_channels
231
- self.filter_channels = filter_channels
232
- self.kernel_size = kernel_size
233
- self.p_dropout = p_dropout
234
- self.gin_channels = gin_channels
235
-
236
- self.drop = nn.Dropout(p_dropout)
237
- self.conv_1 = nn.Conv1d(
238
- in_channels, filter_channels, kernel_size, padding=kernel_size // 2
239
- )
240
- self.norm_1 = modules.LayerNorm(filter_channels)
241
- self.conv_2 = nn.Conv1d(
242
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
243
- )
244
- self.norm_2 = modules.LayerNorm(filter_channels)
245
- self.proj = nn.Conv1d(filter_channels, 1, 1)
246
-
247
- if gin_channels != 0:
248
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
249
-
250
- def forward(self, x, x_mask, g=None):
251
- x = torch.detach(x)
252
- if g is not None:
253
- g = torch.detach(g)
254
- x = x + self.cond(g)
255
- x = self.conv_1(x * x_mask)
256
- x = torch.relu(x)
257
- x = self.norm_1(x)
258
- x = self.drop(x)
259
- x = self.conv_2(x * x_mask)
260
- x = torch.relu(x)
261
- x = self.norm_2(x)
262
- x = self.drop(x)
263
- x = self.proj(x * x_mask)
264
- return x * x_mask
265
-
266
-
267
- class TextEncoder(nn.Module):
268
- def __init__(
269
- self,
270
- n_vocab,
271
- out_channels,
272
- hidden_channels,
273
- filter_channels,
274
- n_heads,
275
- n_layers,
276
- kernel_size,
277
- p_dropout,
278
- gin_channels=0,
279
- ):
280
- super().__init__()
281
- self.n_vocab = n_vocab
282
- self.out_channels = out_channels
283
- self.hidden_channels = hidden_channels
284
- self.filter_channels = filter_channels
285
- self.n_heads = n_heads
286
- self.n_layers = n_layers
287
- self.kernel_size = kernel_size
288
- self.p_dropout = p_dropout
289
- self.gin_channels = gin_channels
290
- self.emb = nn.Embedding(len(symbols), hidden_channels)
291
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
292
- self.tone_emb = nn.Embedding(num_tones, hidden_channels)
293
- nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
294
- self.language_emb = nn.Embedding(num_languages, hidden_channels)
295
- nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
296
- self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
297
- self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
298
- self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
299
-
300
- self.encoder = attentions_onnx.Encoder(
301
- hidden_channels,
302
- filter_channels,
303
- n_heads,
304
- n_layers,
305
- kernel_size,
306
- p_dropout,
307
- gin_channels=self.gin_channels,
308
- )
309
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
310
-
311
- def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None):
312
- x_mask = torch.ones_like(x).unsqueeze(0)
313
- bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2)
314
- ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(
315
- 1, 2
316
- )
317
- en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(
318
- 1, 2
319
- )
320
- x = (
321
- self.emb(x)
322
- + self.tone_emb(tone)
323
- + self.language_emb(language)
324
- + bert_emb
325
- + ja_bert_emb
326
- + en_bert_emb
327
- ) * math.sqrt(
328
- self.hidden_channels
329
- ) # [b, t, h]
330
- x = torch.transpose(x, 1, -1) # [b, h, t]
331
- x_mask = x_mask.to(x.dtype)
332
-
333
- x = self.encoder(x * x_mask, x_mask, g=g)
334
- stats = self.proj(x) * x_mask
335
-
336
- m, logs = torch.split(stats, self.out_channels, dim=1)
337
- return x, m, logs, x_mask
338
-
339
-
340
- class ResidualCouplingBlock(nn.Module):
341
- def __init__(
342
- self,
343
- channels,
344
- hidden_channels,
345
- kernel_size,
346
- dilation_rate,
347
- n_layers,
348
- n_flows=4,
349
- gin_channels=0,
350
- ):
351
- super().__init__()
352
- self.channels = channels
353
- self.hidden_channels = hidden_channels
354
- self.kernel_size = kernel_size
355
- self.dilation_rate = dilation_rate
356
- self.n_layers = n_layers
357
- self.n_flows = n_flows
358
- self.gin_channels = gin_channels
359
-
360
- self.flows = nn.ModuleList()
361
- for i in range(n_flows):
362
- self.flows.append(
363
- modules.ResidualCouplingLayer(
364
- channels,
365
- hidden_channels,
366
- kernel_size,
367
- dilation_rate,
368
- n_layers,
369
- gin_channels=gin_channels,
370
- mean_only=True,
371
- )
372
- )
373
- self.flows.append(modules.Flip())
374
-
375
- def forward(self, x, x_mask, g=None, reverse=True):
376
- if not reverse:
377
- for flow in self.flows:
378
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
379
- else:
380
- for flow in reversed(self.flows):
381
- x = flow(x, x_mask, g=g, reverse=reverse)
382
- return x
383
-
384
-
385
- class PosteriorEncoder(nn.Module):
386
- def __init__(
387
- self,
388
- in_channels,
389
- out_channels,
390
- hidden_channels,
391
- kernel_size,
392
- dilation_rate,
393
- n_layers,
394
- gin_channels=0,
395
- ):
396
- super().__init__()
397
- self.in_channels = in_channels
398
- self.out_channels = out_channels
399
- self.hidden_channels = hidden_channels
400
- self.kernel_size = kernel_size
401
- self.dilation_rate = dilation_rate
402
- self.n_layers = n_layers
403
- self.gin_channels = gin_channels
404
-
405
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
406
- self.enc = modules.WN(
407
- hidden_channels,
408
- kernel_size,
409
- dilation_rate,
410
- n_layers,
411
- gin_channels=gin_channels,
412
- )
413
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
414
-
415
- def forward(self, x, x_lengths, g=None):
416
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
417
- x.dtype
418
- )
419
- x = self.pre(x) * x_mask
420
- x = self.enc(x, x_mask, g=g)
421
- stats = self.proj(x) * x_mask
422
- m, logs = torch.split(stats, self.out_channels, dim=1)
423
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
424
- return z, m, logs, x_mask
425
-
426
-
427
- class Generator(torch.nn.Module):
428
- def __init__(
429
- self,
430
- initial_channel,
431
- resblock,
432
- resblock_kernel_sizes,
433
- resblock_dilation_sizes,
434
- upsample_rates,
435
- upsample_initial_channel,
436
- upsample_kernel_sizes,
437
- gin_channels=0,
438
- ):
439
- super(Generator, self).__init__()
440
- self.num_kernels = len(resblock_kernel_sizes)
441
- self.num_upsamples = len(upsample_rates)
442
- self.conv_pre = Conv1d(
443
- initial_channel, upsample_initial_channel, 7, 1, padding=3
444
- )
445
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
446
-
447
- self.ups = nn.ModuleList()
448
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
449
- self.ups.append(
450
- weight_norm(
451
- ConvTranspose1d(
452
- upsample_initial_channel // (2**i),
453
- upsample_initial_channel // (2 ** (i + 1)),
454
- k,
455
- u,
456
- padding=(k - u) // 2,
457
- )
458
- )
459
- )
460
-
461
- self.resblocks = nn.ModuleList()
462
- for i in range(len(self.ups)):
463
- ch = upsample_initial_channel // (2 ** (i + 1))
464
- for j, (k, d) in enumerate(
465
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
466
- ):
467
- self.resblocks.append(resblock(ch, k, d))
468
-
469
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
470
- self.ups.apply(init_weights)
471
-
472
- if gin_channels != 0:
473
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
474
-
475
- def forward(self, x, g=None):
476
- x = self.conv_pre(x)
477
- if g is not None:
478
- x = x + self.cond(g)
479
-
480
- for i in range(self.num_upsamples):
481
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
482
- x = self.ups[i](x)
483
- xs = None
484
- for j in range(self.num_kernels):
485
- if xs is None:
486
- xs = self.resblocks[i * self.num_kernels + j](x)
487
- else:
488
- xs += self.resblocks[i * self.num_kernels + j](x)
489
- x = xs / self.num_kernels
490
- x = F.leaky_relu(x)
491
- x = self.conv_post(x)
492
- x = torch.tanh(x)
493
-
494
- return x
495
-
496
- def remove_weight_norm(self):
497
- print("Removing weight norm...")
498
- for layer in self.ups:
499
- remove_weight_norm(layer)
500
- for layer in self.resblocks:
501
- layer.remove_weight_norm()
502
-
503
-
504
- class DiscriminatorP(torch.nn.Module):
505
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
506
- super(DiscriminatorP, self).__init__()
507
- self.period = period
508
- self.use_spectral_norm = use_spectral_norm
509
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
510
- self.convs = nn.ModuleList(
511
- [
512
- norm_f(
513
- Conv2d(
514
- 1,
515
- 32,
516
- (kernel_size, 1),
517
- (stride, 1),
518
- padding=(get_padding(kernel_size, 1), 0),
519
- )
520
- ),
521
- norm_f(
522
- Conv2d(
523
- 32,
524
- 128,
525
- (kernel_size, 1),
526
- (stride, 1),
527
- padding=(get_padding(kernel_size, 1), 0),
528
- )
529
- ),
530
- norm_f(
531
- Conv2d(
532
- 128,
533
- 512,
534
- (kernel_size, 1),
535
- (stride, 1),
536
- padding=(get_padding(kernel_size, 1), 0),
537
- )
538
- ),
539
- norm_f(
540
- Conv2d(
541
- 512,
542
- 1024,
543
- (kernel_size, 1),
544
- (stride, 1),
545
- padding=(get_padding(kernel_size, 1), 0),
546
- )
547
- ),
548
- norm_f(
549
- Conv2d(
550
- 1024,
551
- 1024,
552
- (kernel_size, 1),
553
- 1,
554
- padding=(get_padding(kernel_size, 1), 0),
555
- )
556
- ),
557
- ]
558
- )
559
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
560
-
561
- def forward(self, x):
562
- fmap = []
563
-
564
- # 1d to 2d
565
- b, c, t = x.shape
566
- if t % self.period != 0: # pad first
567
- n_pad = self.period - (t % self.period)
568
- x = F.pad(x, (0, n_pad), "reflect")
569
- t = t + n_pad
570
- x = x.view(b, c, t // self.period, self.period)
571
-
572
- for layer in self.convs:
573
- x = layer(x)
574
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
575
- fmap.append(x)
576
- x = self.conv_post(x)
577
- fmap.append(x)
578
- x = torch.flatten(x, 1, -1)
579
-
580
- return x, fmap
581
-
582
-
583
- class DiscriminatorS(torch.nn.Module):
584
- def __init__(self, use_spectral_norm=False):
585
- super(DiscriminatorS, self).__init__()
586
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
587
- self.convs = nn.ModuleList(
588
- [
589
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
590
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
591
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
592
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
593
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
594
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
595
- ]
596
- )
597
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
598
-
599
- def forward(self, x):
600
- fmap = []
601
-
602
- for layer in self.convs:
603
- x = layer(x)
604
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
605
- fmap.append(x)
606
- x = self.conv_post(x)
607
- fmap.append(x)
608
- x = torch.flatten(x, 1, -1)
609
-
610
- return x, fmap
611
-
612
-
613
- class MultiPeriodDiscriminator(torch.nn.Module):
614
- def __init__(self, use_spectral_norm=False):
615
- super(MultiPeriodDiscriminator, self).__init__()
616
- periods = [2, 3, 5, 7, 11]
617
-
618
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
619
- discs = discs + [
620
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
621
- ]
622
- self.discriminators = nn.ModuleList(discs)
623
-
624
- def forward(self, y, y_hat):
625
- y_d_rs = []
626
- y_d_gs = []
627
- fmap_rs = []
628
- fmap_gs = []
629
- for i, d in enumerate(self.discriminators):
630
- y_d_r, fmap_r = d(y)
631
- y_d_g, fmap_g = d(y_hat)
632
- y_d_rs.append(y_d_r)
633
- y_d_gs.append(y_d_g)
634
- fmap_rs.append(fmap_r)
635
- fmap_gs.append(fmap_g)
636
-
637
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
638
-
639
-
640
- class ReferenceEncoder(nn.Module):
641
- """
642
- inputs --- [N, Ty/r, n_mels*r] mels
643
- outputs --- [N, ref_enc_gru_size]
644
- """
645
-
646
- def __init__(self, spec_channels, gin_channels=0):
647
- super().__init__()
648
- self.spec_channels = spec_channels
649
- ref_enc_filters = [32, 32, 64, 64, 128, 128]
650
- K = len(ref_enc_filters)
651
- filters = [1] + ref_enc_filters
652
- convs = [
653
- weight_norm(
654
- nn.Conv2d(
655
- in_channels=filters[i],
656
- out_channels=filters[i + 1],
657
- kernel_size=(3, 3),
658
- stride=(2, 2),
659
- padding=(1, 1),
660
- )
661
- )
662
- for i in range(K)
663
- ]
664
- self.convs = nn.ModuleList(convs)
665
- # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
666
-
667
- out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
668
- self.gru = nn.GRU(
669
- input_size=ref_enc_filters[-1] * out_channels,
670
- hidden_size=256 // 2,
671
- batch_first=True,
672
- )
673
- self.proj = nn.Linear(128, gin_channels)
674
-
675
- def forward(self, inputs, mask=None):
676
- N = inputs.size(0)
677
- out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
678
- for conv in self.convs:
679
- out = conv(out)
680
- # out = wn(out)
681
- out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
682
-
683
- out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
684
- T = out.size(1)
685
- N = out.size(0)
686
- out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
687
-
688
- self.gru.flatten_parameters()
689
- memory, out = self.gru(out) # out --- [1, N, 128]
690
-
691
- return self.proj(out.squeeze(0))
692
-
693
- def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
694
- for i in range(n_convs):
695
- L = (L - kernel_size + 2 * pad) // stride + 1
696
- return L
697
-
698
-
699
- class SynthesizerTrn(nn.Module):
700
- """
701
- Synthesizer for Training
702
- """
703
-
704
- def __init__(
705
- self,
706
- n_vocab,
707
- spec_channels,
708
- segment_size,
709
- inter_channels,
710
- hidden_channels,
711
- filter_channels,
712
- n_heads,
713
- n_layers,
714
- kernel_size,
715
- p_dropout,
716
- resblock,
717
- resblock_kernel_sizes,
718
- resblock_dilation_sizes,
719
- upsample_rates,
720
- upsample_initial_channel,
721
- upsample_kernel_sizes,
722
- n_speakers=256,
723
- gin_channels=256,
724
- use_sdp=True,
725
- n_flow_layer=4,
726
- n_layers_trans_flow=4,
727
- flow_share_parameter=False,
728
- use_transformer_flow=True,
729
- **kwargs,
730
- ):
731
- super().__init__()
732
- self.n_vocab = n_vocab
733
- self.spec_channels = spec_channels
734
- self.inter_channels = inter_channels
735
- self.hidden_channels = hidden_channels
736
- self.filter_channels = filter_channels
737
- self.n_heads = n_heads
738
- self.n_layers = n_layers
739
- self.kernel_size = kernel_size
740
- self.p_dropout = p_dropout
741
- self.resblock = resblock
742
- self.resblock_kernel_sizes = resblock_kernel_sizes
743
- self.resblock_dilation_sizes = resblock_dilation_sizes
744
- self.upsample_rates = upsample_rates
745
- self.upsample_initial_channel = upsample_initial_channel
746
- self.upsample_kernel_sizes = upsample_kernel_sizes
747
- self.segment_size = segment_size
748
- self.n_speakers = n_speakers
749
- self.gin_channels = gin_channels
750
- self.n_layers_trans_flow = n_layers_trans_flow
751
- self.use_spk_conditioned_encoder = kwargs.get(
752
- "use_spk_conditioned_encoder", True
753
- )
754
- self.use_sdp = use_sdp
755
- self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
756
- self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
757
- self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
758
- self.current_mas_noise_scale = self.mas_noise_scale_initial
759
- if self.use_spk_conditioned_encoder and gin_channels > 0:
760
- self.enc_gin_channels = gin_channels
761
- self.enc_p = TextEncoder(
762
- n_vocab,
763
- inter_channels,
764
- hidden_channels,
765
- filter_channels,
766
- n_heads,
767
- n_layers,
768
- kernel_size,
769
- p_dropout,
770
- gin_channels=self.enc_gin_channels,
771
- )
772
- self.dec = Generator(
773
- inter_channels,
774
- resblock,
775
- resblock_kernel_sizes,
776
- resblock_dilation_sizes,
777
- upsample_rates,
778
- upsample_initial_channel,
779
- upsample_kernel_sizes,
780
- gin_channels=gin_channels,
781
- )
782
- self.enc_q = PosteriorEncoder(
783
- spec_channels,
784
- inter_channels,
785
- hidden_channels,
786
- 5,
787
- 1,
788
- 16,
789
- gin_channels=gin_channels,
790
- )
791
- if use_transformer_flow:
792
- self.flow = TransformerCouplingBlock(
793
- inter_channels,
794
- hidden_channels,
795
- filter_channels,
796
- n_heads,
797
- n_layers_trans_flow,
798
- 5,
799
- p_dropout,
800
- n_flow_layer,
801
- gin_channels=gin_channels,
802
- share_parameter=flow_share_parameter,
803
- )
804
- else:
805
- self.flow = ResidualCouplingBlock(
806
- inter_channels,
807
- hidden_channels,
808
- 5,
809
- 1,
810
- n_flow_layer,
811
- gin_channels=gin_channels,
812
- )
813
- self.sdp = StochasticDurationPredictor(
814
- hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
815
- )
816
- self.dp = DurationPredictor(
817
- hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
818
- )
819
-
820
- if n_speakers >= 1:
821
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
822
- else:
823
- self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
824
-
825
- def export_onnx(
826
- self,
827
- path,
828
- max_len=None,
829
- sdp_ratio=0,
830
- y=None,
831
- ):
832
- noise_scale = 0.667
833
- length_scale = 1
834
- noise_scale_w = 0.8
835
- x = (
836
- torch.LongTensor(
837
- [
838
- 0,
839
- 97,
840
- 0,
841
- 8,
842
- 0,
843
- 78,
844
- 0,
845
- 8,
846
- 0,
847
- 76,
848
- 0,
849
- 37,
850
- 0,
851
- 40,
852
- 0,
853
- 97,
854
- 0,
855
- 8,
856
- 0,
857
- 23,
858
- 0,
859
- 8,
860
- 0,
861
- 74,
862
- 0,
863
- 26,
864
- 0,
865
- 104,
866
- 0,
867
- ]
868
- )
869
- .unsqueeze(0)
870
- .cpu()
871
- )
872
- tone = torch.zeros_like(x).cpu()
873
- language = torch.zeros_like(x).cpu()
874
- x_lengths = torch.LongTensor([x.shape[1]]).cpu()
875
- sid = torch.LongTensor([0]).cpu()
876
- bert = torch.randn(size=(x.shape[1], 1024)).cpu()
877
- ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
878
- en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
879
-
880
- if self.n_speakers > 0:
881
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
882
- torch.onnx.export(
883
- self.emb_g,
884
- (sid),
885
- f"onnx/{path}/{path}_emb.onnx",
886
- input_names=["sid"],
887
- output_names=["g"],
888
- verbose=True,
889
- )
890
- else:
891
- g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
892
-
893
- torch.onnx.export(
894
- self.enc_p,
895
- (x, x_lengths, tone, language, bert, ja_bert, en_bert, g),
896
- f"onnx/{path}/{path}_enc_p.onnx",
897
- input_names=[
898
- "x",
899
- "x_lengths",
900
- "t",
901
- "language",
902
- "bert_0",
903
- "bert_1",
904
- "bert_2",
905
- "g",
906
- ],
907
- output_names=["xout", "m_p", "logs_p", "x_mask"],
908
- dynamic_axes={
909
- "x": [0, 1],
910
- "t": [0, 1],
911
- "language": [0, 1],
912
- "bert_0": [0],
913
- "bert_1": [0],
914
- "bert_2": [0],
915
- "xout": [0, 2],
916
- "m_p": [0, 2],
917
- "logs_p": [0, 2],
918
- "x_mask": [0, 2],
919
- },
920
- verbose=True,
921
- opset_version=16,
922
- )
923
- x, m_p, logs_p, x_mask = self.enc_p(
924
- x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
925
- )
926
- zinput = (
927
- torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
928
- * noise_scale_w
929
- )
930
- torch.onnx.export(
931
- self.sdp,
932
- (x, x_mask, zinput, g),
933
- f"onnx/{path}/{path}_sdp.onnx",
934
- input_names=["x", "x_mask", "zin", "g"],
935
- output_names=["logw"],
936
- dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
937
- verbose=True,
938
- )
939
- torch.onnx.export(
940
- self.dp,
941
- (x, x_mask, g),
942
- f"onnx/{path}/{path}_dp.onnx",
943
- input_names=["x", "x_mask", "g"],
944
- output_names=["logw"],
945
- dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
946
- verbose=True,
947
- )
948
- logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
949
- x, x_mask, g=g
950
- ) * (1 - sdp_ratio)
951
- w = torch.exp(logw) * x_mask * length_scale
952
- w_ceil = torch.ceil(w)
953
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
954
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
955
- x_mask.dtype
956
- )
957
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
958
- attn = commons.generate_path(w_ceil, attn_mask)
959
-
960
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
961
- 1, 2
962
- ) # [b, t', t], [b, t, d] -> [b, d, t']
963
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
964
- 1, 2
965
- ) # [b, t', t], [b, t, d] -> [b, d, t']
966
-
967
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
968
- torch.onnx.export(
969
- self.flow,
970
- (z_p, y_mask, g),
971
- f"onnx/{path}/{path}_flow.onnx",
972
- input_names=["z_p", "y_mask", "g"],
973
- output_names=["z"],
974
- dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
975
- verbose=True,
976
- )
977
-
978
- z = self.flow(z_p, y_mask, g=g, reverse=True)
979
- z_in = (z * y_mask)[:, :, :max_len]
980
-
981
- torch.onnx.export(
982
- self.dec,
983
- (z_in, g),
984
- f"onnx/{path}/{path}_dec.onnx",
985
- input_names=["z_in", "g"],
986
- output_names=["o"],
987
- dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
988
- verbose=True,
989
- )
990
- o = self.dec((z * y_mask)[:, :, :max_len], g=g)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_modules/V200/text/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .symbols import *
 
 
onnx_modules/V200/text/bert_utils.py DELETED
@@ -1,23 +0,0 @@
1
- from pathlib import Path
2
-
3
- from huggingface_hub import hf_hub_download
4
-
5
- from config import config
6
-
7
-
8
- MIRROR: str = config.mirror
9
-
10
-
11
- def _check_bert(repo_id, files, local_path):
12
- for file in files:
13
- if not Path(local_path).joinpath(file).exists():
14
- if MIRROR.lower() == "openi":
15
- import openi
16
-
17
- openi.model.download_model(
18
- "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19
- )
20
- else:
21
- hf_hub_download(
22
- repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_modules/V200/text/chinese.py DELETED
@@ -1,198 +0,0 @@
1
- import os
2
- import re
3
-
4
- import cn2an
5
- from pypinyin import lazy_pinyin, Style
6
-
7
- from .symbols import punctuation
8
- from .tone_sandhi import ToneSandhi
9
-
10
- current_file_path = os.path.dirname(__file__)
11
- pinyin_to_symbol_map = {
12
- line.split("\t")[0]: line.strip().split("\t")[1]
13
- for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14
- }
15
-
16
- import jieba.posseg as psg
17
-
18
-
19
- rep_map = {
20
- ":": ",",
21
- ";": ",",
22
- ",": ",",
23
- "。": ".",
24
- "!": "!",
25
- "?": "?",
26
- "\n": ".",
27
- "·": ",",
28
- "、": ",",
29
- "...": "…",
30
- "$": ".",
31
- "“": "'",
32
- "”": "'",
33
- "‘": "'",
34
- "’": "'",
35
- "(": "'",
36
- ")": "'",
37
- "(": "'",
38
- ")": "'",
39
- "《": "'",
40
- "》": "'",
41
- "【": "'",
42
- "】": "'",
43
- "[": "'",
44
- "]": "'",
45
- "—": "-",
46
- "~": "-",
47
- "~": "-",
48
- "「": "'",
49
- "」": "'",
50
- }
51
-
52
- tone_modifier = ToneSandhi()
53
-
54
-
55
- def replace_punctuation(text):
56
- text = text.replace("嗯", "恩").replace("呣", "母")
57
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58
-
59
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60
-
61
- replaced_text = re.sub(
62
- r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63
- )
64
-
65
- return replaced_text
66
-
67
-
68
- def g2p(text):
69
- pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70
- sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71
- phones, tones, word2ph = _g2p(sentences)
72
- assert sum(word2ph) == len(phones)
73
- assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch.
74
- phones = ["_"] + phones + ["_"]
75
- tones = [0] + tones + [0]
76
- word2ph = [1] + word2ph + [1]
77
- return phones, tones, word2ph
78
-
79
-
80
- def _get_initials_finals(word):
81
- initials = []
82
- finals = []
83
- orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84
- orig_finals = lazy_pinyin(
85
- word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86
- )
87
- for c, v in zip(orig_initials, orig_finals):
88
- initials.append(c)
89
- finals.append(v)
90
- return initials, finals
91
-
92
-
93
- def _g2p(segments):
94
- phones_list = []
95
- tones_list = []
96
- word2ph = []
97
- for seg in segments:
98
- # Replace all English words in the sentence
99
- seg = re.sub("[a-zA-Z]+", "", seg)
100
- seg_cut = psg.lcut(seg)
101
- initials = []
102
- finals = []
103
- seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104
- for word, pos in seg_cut:
105
- if pos == "eng":
106
- continue
107
- sub_initials, sub_finals = _get_initials_finals(word)
108
- sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109
- initials.append(sub_initials)
110
- finals.append(sub_finals)
111
-
112
- # assert len(sub_initials) == len(sub_finals) == len(word)
113
- initials = sum(initials, [])
114
- finals = sum(finals, [])
115
- #
116
- for c, v in zip(initials, finals):
117
- raw_pinyin = c + v
118
- # NOTE: post process for pypinyin outputs
119
- # we discriminate i, ii and iii
120
- if c == v:
121
- assert c in punctuation
122
- phone = [c]
123
- tone = "0"
124
- word2ph.append(1)
125
- else:
126
- v_without_tone = v[:-1]
127
- tone = v[-1]
128
-
129
- pinyin = c + v_without_tone
130
- assert tone in "12345"
131
-
132
- if c:
133
- # 多音节
134
- v_rep_map = {
135
- "uei": "ui",
136
- "iou": "iu",
137
- "uen": "un",
138
- }
139
- if v_without_tone in v_rep_map.keys():
140
- pinyin = c + v_rep_map[v_without_tone]
141
- else:
142
- # 单音节
143
- pinyin_rep_map = {
144
- "ing": "ying",
145
- "i": "yi",
146
- "in": "yin",
147
- "u": "wu",
148
- }
149
- if pinyin in pinyin_rep_map.keys():
150
- pinyin = pinyin_rep_map[pinyin]
151
- else:
152
- single_rep_map = {
153
- "v": "yu",
154
- "e": "e",
155
- "i": "y",
156
- "u": "w",
157
- }
158
- if pinyin[0] in single_rep_map.keys():
159
- pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160
-
161
- assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162
- phone = pinyin_to_symbol_map[pinyin].split(" ")
163
- word2ph.append(len(phone))
164
-
165
- phones_list += phone
166
- tones_list += [int(tone)] * len(phone)
167
- return phones_list, tones_list, word2ph
168
-
169
-
170
- def text_normalize(text):
171
- numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172
- for number in numbers:
173
- text = text.replace(number, cn2an.an2cn(number), 1)
174
- text = replace_punctuation(text)
175
- return text
176
-
177
-
178
- def get_bert_feature(text, word2ph):
179
- from text import chinese_bert
180
-
181
- return chinese_bert.get_bert_feature(text, word2ph)
182
-
183
-
184
- if __name__ == "__main__":
185
- from text.chinese_bert import get_bert_feature
186
-
187
- text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188
- text = text_normalize(text)
189
- print(text)
190
- phones, tones, word2ph = g2p(text)
191
- bert = get_bert_feature(text, word2ph)
192
-
193
- print(phones, tones, word2ph, bert.shape)
194
-
195
-
196
- # # 示例用法
197
- # text = "这是一个示例文本:,你好!这是一个测试...."
198
- # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_modules/V200/text/chinese_bert.py DELETED
@@ -1,101 +0,0 @@
1
- import sys
2
-
3
- import torch
4
- from transformers import AutoModelForMaskedLM, AutoTokenizer
5
-
6
- from config import config
7
-
8
- LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
9
-
10
- tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
11
-
12
- models = dict()
13
-
14
-
15
- def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
16
- if (
17
- sys.platform == "darwin"
18
- and torch.backends.mps.is_available()
19
- and device == "cpu"
20
- ):
21
- device = "mps"
22
- if not device:
23
- device = "cuda"
24
- if device not in models.keys():
25
- models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
26
- with torch.no_grad():
27
- inputs = tokenizer(text, return_tensors="pt")
28
- for i in inputs:
29
- inputs[i] = inputs[i].to(device)
30
- res = models[device](**inputs, output_hidden_states=True)
31
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
32
-
33
- assert len(word2ph) == len(text) + 2
34
- word2phone = word2ph
35
- phone_level_feature = []
36
- for i in range(len(word2phone)):
37
- repeat_feature = res[i].repeat(word2phone[i], 1)
38
- phone_level_feature.append(repeat_feature)
39
-
40
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
41
-
42
- return phone_level_feature.T
43
-
44
-
45
- if __name__ == "__main__":
46
- word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征
47
- word2phone = [
48
- 1,
49
- 2,
50
- 1,
51
- 2,
52
- 2,
53
- 1,
54
- 2,
55
- 2,
56
- 1,
57
- 2,
58
- 2,
59
- 1,
60
- 2,
61
- 2,
62
- 2,
63
- 2,
64
- 2,
65
- 1,
66
- 1,
67
- 2,
68
- 2,
69
- 1,
70
- 2,
71
- 2,
72
- 2,
73
- 2,
74
- 1,
75
- 2,
76
- 2,
77
- 2,
78
- 2,
79
- 2,
80
- 1,
81
- 2,
82
- 2,
83
- 2,
84
- 2,
85
- 1,
86
- ]
87
-
88
- # 计算总帧数
89
- total_frames = sum(word2phone)
90
- print(word_level_feature.shape)
91
- print(word2phone)
92
- phone_level_feature = []
93
- for i in range(len(word2phone)):
94
- print(word_level_feature[i].shape)
95
-
96
- # 对每个词重复word2phone[i]次
97
- repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
98
- phone_level_feature.append(repeat_feature)
99
-
100
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
101
- print(phone_level_feature.shape) # torch.Size([36, 1024])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_modules/V200/text/cleaner.py DELETED
@@ -1,28 +0,0 @@
1
- from . import chinese, japanese, english, cleaned_text_to_sequence
2
-
3
-
4
- language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
5
-
6
-
7
- def clean_text(text, language):
8
- language_module = language_module_map[language]
9
- norm_text = language_module.text_normalize(text)
10
- phones, tones, word2ph = language_module.g2p(norm_text)
11
- return norm_text, phones, tones, word2ph
12
-
13
-
14
- def clean_text_bert(text, language):
15
- language_module = language_module_map[language]
16
- norm_text = language_module.text_normalize(text)
17
- phones, tones, word2ph = language_module.g2p(norm_text)
18
- bert = language_module.get_bert_feature(norm_text, word2ph)
19
- return phones, tones, bert
20
-
21
-
22
- def text_to_sequence(text, language):
23
- norm_text, phones, tones, word2ph = clean_text(text, language)
24
- return cleaned_text_to_sequence(phones, tones, language)
25
-
26
-
27
- if __name__ == "__main__":
28
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_modules/V200/text/english.py DELETED
@@ -1,362 +0,0 @@
1
- import pickle
2
- import os
3
- import re
4
- from g2p_en import G2p
5
-
6
- from . import symbols
7
-
8
- current_file_path = os.path.dirname(__file__)
9
- CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10
- CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11
- _g2p = G2p()
12
-
13
- arpa = {
14
- "AH0",
15
- "S",
16
- "AH1",
17
- "EY2",
18
- "AE2",
19
- "EH0",
20
- "OW2",
21
- "UH0",
22
- "NG",
23
- "B",
24
- "G",
25
- "AY0",
26
- "M",
27
- "AA0",
28
- "F",
29
- "AO0",
30
- "ER2",
31
- "UH1",
32
- "IY1",
33
- "AH2",
34
- "DH",
35
- "IY0",
36
- "EY1",
37
- "IH0",
38
- "K",
39
- "N",
40
- "W",
41
- "IY2",
42
- "T",
43
- "AA1",
44
- "ER1",
45
- "EH2",
46
- "OY0",
47
- "UH2",
48
- "UW1",
49
- "Z",
50
- "AW2",
51
- "AW1",
52
- "V",
53
- "UW2",
54
- "AA2",
55
- "ER",
56
- "AW0",
57
- "UW0",
58
- "R",
59
- "OW1",
60
- "EH1",
61
- "ZH",
62
- "AE0",
63
- "IH2",
64
- "IH",
65
- "Y",
66
- "JH",
67
- "P",
68
- "AY1",
69
- "EY0",
70
- "OY2",
71
- "TH",
72
- "HH",
73
- "D",
74
- "ER0",
75
- "CH",
76
- "AO1",
77
- "AE1",
78
- "AO2",
79
- "OY1",
80
- "AY2",
81
- "IH1",
82
- "OW0",
83
- "L",
84
- "SH",
85
- }
86
-
87
-
88
- def post_replace_ph(ph):
89
- rep_map = {
90
- ":": ",",
91
- ";": ",",
92
- ",": ",",
93
- "。": ".",
94
- "!": "!",
95
- "?": "?",
96
- "\n": ".",
97
- "·": ",",
98
- "、": ",",
99
- "...": "…",
100
- "v": "V",
101
- }
102
- if ph in rep_map.keys():
103
- ph = rep_map[ph]
104
- if ph in symbols:
105
- return ph
106
- if ph not in symbols:
107
- ph = "UNK"
108
- return ph
109
-
110
-
111
- def read_dict():
112
- g2p_dict = {}
113
- start_line = 49
114
- with open(CMU_DICT_PATH) as f:
115
- line = f.readline()
116
- line_index = 1
117
- while line:
118
- if line_index >= start_line:
119
- line = line.strip()
120
- word_split = line.split(" ")
121
- word = word_split[0]
122
-
123
- syllable_split = word_split[1].split(" - ")
124
- g2p_dict[word] = []
125
- for syllable in syllable_split:
126
- phone_split = syllable.split(" ")
127
- g2p_dict[word].append(phone_split)
128
-
129
- line_index = line_index + 1
130
- line = f.readline()
131
-
132
- return g2p_dict
133
-
134
-
135
- def cache_dict(g2p_dict, file_path):
136
- with open(file_path, "wb") as pickle_file:
137
- pickle.dump(g2p_dict, pickle_file)
138
-
139
-
140
- def get_dict():
141
- if os.path.exists(CACHE_PATH):
142
- with open(CACHE_PATH, "rb") as pickle_file:
143
- g2p_dict = pickle.load(pickle_file)
144
- else:
145
- g2p_dict = read_dict()
146
- cache_dict(g2p_dict, CACHE_PATH)
147
-
148
- return g2p_dict
149
-
150
-
151
- eng_dict = get_dict()
152
-
153
-
154
- def refine_ph(phn):
155
- tone = 0
156
- if re.search(r"\d$", phn):
157
- tone = int(phn[-1]) + 1
158
- phn = phn[:-1]
159
- return phn.lower(), tone
160
-
161
-
162
- def refine_syllables(syllables):
163
- tones = []
164
- phonemes = []
165
- for phn_list in syllables:
166
- for i in range(len(phn_list)):
167
- phn = phn_list[i]
168
- phn, tone = refine_ph(phn)
169
- phonemes.append(phn)
170
- tones.append(tone)
171
- return phonemes, tones
172
-
173
-
174
- import re
175
- import inflect
176
-
177
- _inflect = inflect.engine()
178
- _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
179
- _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
180
- _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
181
- _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
182
- _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
183
- _number_re = re.compile(r"[0-9]+")
184
-
185
- # List of (regular expression, replacement) pairs for abbreviations:
186
- _abbreviations = [
187
- (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
188
- for x in [
189
- ("mrs", "misess"),
190
- ("mr", "mister"),
191
- ("dr", "doctor"),
192
- ("st", "saint"),
193
- ("co", "company"),
194
- ("jr", "junior"),
195
- ("maj", "major"),
196
- ("gen", "general"),
197
- ("drs", "doctors"),
198
- ("rev", "reverend"),
199
- ("lt", "lieutenant"),
200
- ("hon", "honorable"),
201
- ("sgt", "sergeant"),
202
- ("capt", "captain"),
203
- ("esq", "esquire"),
204
- ("ltd", "limited"),
205
- ("col", "colonel"),
206
- ("ft", "fort"),
207
- ]
208
- ]
209
-
210
-
211
- # List of (ipa, lazy ipa) pairs:
212
- _lazy_ipa = [
213
- (re.compile("%s" % x[0]), x[1])
214
- for x in [
215
- ("r", "ɹ"),
216
- ("æ", "e"),
217
- ("ɑ", "a"),
218
- ("ɔ", "o"),
219
- ("ð", "z"),
220
- ("θ", "s"),
221
- ("ɛ", "e"),
222
- ("ɪ", "i"),
223
- ("ʊ", "u"),
224
- ("ʒ", "ʥ"),
225
- ("ʤ", "ʥ"),
226
- ("ˈ", "↓"),
227
- ]
228
- ]
229
-
230
- # List of (ipa, lazy ipa2) pairs:
231
- _lazy_ipa2 = [
232
- (re.compile("%s" % x[0]), x[1])
233
- for x in [
234
- ("r", "ɹ"),
235
- ("ð", "z"),
236
- ("θ", "s"),
237
- ("ʒ", "ʑ"),
238
- ("ʤ", "dʑ"),
239
- ("ˈ", "↓"),
240
- ]
241
- ]
242
-
243
- # List of (ipa, ipa2) pairs
244
- _ipa_to_ipa2 = [
245
- (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
246
- ]
247
-
248
-
249
- def _expand_dollars(m):
250
- match = m.group(1)
251
- parts = match.split(".")
252
- if len(parts) > 2:
253
- return match + " dollars" # Unexpected format
254
- dollars = int(parts[0]) if parts[0] else 0
255
- cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
256
- if dollars and cents:
257
- dollar_unit = "dollar" if dollars == 1 else "dollars"
258
- cent_unit = "cent" if cents == 1 else "cents"
259
- return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
260
- elif dollars:
261
- dollar_unit = "dollar" if dollars == 1 else "dollars"
262
- return "%s %s" % (dollars, dollar_unit)
263
- elif cents:
264
- cent_unit = "cent" if cents == 1 else "cents"
265
- return "%s %s" % (cents, cent_unit)
266
- else:
267
- return "zero dollars"
268
-
269
-
270
- def _remove_commas(m):
271
- return m.group(1).replace(",", "")
272
-
273
-
274
- def _expand_ordinal(m):
275
- return _inflect.number_to_words(m.group(0))
276
-
277
-
278
- def _expand_number(m):
279
- num = int(m.group(0))
280
- if num > 1000 and num < 3000:
281
- if num == 2000:
282
- return "two thousand"
283
- elif num > 2000 and num < 2010:
284
- return "two thousand " + _inflect.number_to_words(num % 100)
285
- elif num % 100 == 0:
286
- return _inflect.number_to_words(num // 100) + " hundred"
287
- else:
288
- return _inflect.number_to_words(
289
- num, andword="", zero="oh", group=2
290
- ).replace(", ", " ")
291
- else:
292
- return _inflect.number_to_words(num, andword="")
293
-
294
-
295
- def _expand_decimal_point(m):
296
- return m.group(1).replace(".", " point ")
297
-
298
-
299
- def normalize_numbers(text):
300
- text = re.sub(_comma_number_re, _remove_commas, text)
301
- text = re.sub(_pounds_re, r"\1 pounds", text)
302
- text = re.sub(_dollars_re, _expand_dollars, text)
303
- text = re.sub(_decimal_number_re, _expand_decimal_point, text)
304
- text = re.sub(_ordinal_re, _expand_ordinal, text)
305
- text = re.sub(_number_re, _expand_number, text)
306
- return text
307
-
308
-
309
- def text_normalize(text):
310
- text = normalize_numbers(text)
311
- return text
312
-
313
-
314
- def g2p(text):
315
- phones = []
316
- tones = []
317
- word2ph = []
318
- words = re.split(r"([,;.\-\?\!\s+])", text)
319
- words = [word for word in words if word.strip() != ""]
320
- for word in words:
321
- if word.upper() in eng_dict:
322
- phns, tns = refine_syllables(eng_dict[word.upper()])
323
- phones += phns
324
- tones += tns
325
- word2ph.append(len(phns))
326
- else:
327
- phone_list = list(filter(lambda p: p != " ", _g2p(word)))
328
- for ph in phone_list:
329
- if ph in arpa:
330
- ph, tn = refine_ph(ph)
331
- phones.append(ph)
332
- tones.append(tn)
333
- else:
334
- phones.append(ph)
335
- tones.append(0)
336
- word2ph.append(len(phone_list))
337
-
338
- phones = [post_replace_ph(i) for i in phones]
339
-
340
- phones = ["_"] + phones + ["_"]
341
- tones = [0] + tones + [0]
342
- word2ph = [1] + word2ph + [1]
343
-
344
- return phones, tones, word2ph
345
-
346
-
347
- def get_bert_feature(text, word2ph):
348
- from text import english_bert_mock
349
-
350
- return english_bert_mock.get_bert_feature(text, word2ph)
351
-
352
-
353
- if __name__ == "__main__":
354
- # print(get_dict())
355
- # print(eng_word_to_phoneme("hello"))
356
- print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
357
- # all_phones = set()
358
- # for k, syllables in eng_dict.items():
359
- # for group in syllables:
360
- # for ph in group:
361
- # all_phones.add(ph)
362
- # print(all_phones)
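A small, hedged usage sketch of the English front end defined above. It only exercises text_normalize and checks the alignment invariants that g2p establishes at its end (a pad symbol on each side, one tone per phone, word2ph summing to the phone count). The module path "text.english" is an assumption about the package layout.

# Hedged sketch; importing the module loads the CMU dictionary/cache and g2p_en,
# so those assets must be available.
from text.english import g2p, text_normalize  # assumed import path

text = text_normalize("I bought 3 books for $5.")  # expands numbers and currency to words
phones, tones, word2ph = g2p(text)

# Invariants from the tail of g2p(): "_" padding, tones aligned to phones,
# and per-word phone counts summing to the total number of phones.
assert phones[0] == "_" and phones[-1] == "_"
assert len(phones) == len(tones) == sum(word2ph)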
onnx_modules/V200/text/english_bert_mock.py DELETED
@@ -1,42 +0,0 @@
1
- import sys
2
-
3
- import torch
4
- from transformers import DebertaV2Model, DebertaV2Tokenizer
5
-
6
- from config import config
7
-
8
-
9
- LOCAL_PATH = "./bert/deberta-v3-large"
10
-
11
- tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12
-
13
- models = dict()
14
-
15
-
16
- def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17
- if (
18
- sys.platform == "darwin"
19
- and torch.backends.mps.is_available()
20
- and device == "cpu"
21
- ):
22
- device = "mps"
23
- if not device:
24
- device = "cuda"
25
- if device not in models.keys():
26
- models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
27
- with torch.no_grad():
28
- inputs = tokenizer(text, return_tensors="pt")
29
- for i in inputs:
30
- inputs[i] = inputs[i].to(device)
31
- res = models[device](**inputs, output_hidden_states=True)
32
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
33
- # assert len(word2ph) == len(text)+2
34
- word2phone = word2ph
35
- phone_level_feature = []
36
- for i in range(len(word2phone)):
37
- repeat_feature = res[i].repeat(word2phone[i], 1)
38
- phone_level_feature.append(repeat_feature)
39
-
40
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
41
-
42
- return phone_level_feature.T
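The tail of get_bert_feature() above expands token-level hidden states to phone level by repeating each token vector word2ph[i] times. A self-contained sketch of just that expansion, using dummy tensors instead of a loaded DeBERTa model:

# Hedged sketch of the phone-level expansion; dummy numbers only, no model loaded.
import torch

hidden = torch.randn(4, 1024)      # pretend hidden states for 4 tokens
word2ph = [1, 3, 2, 1]             # phones attributed to each token

phone_level = torch.cat(
    [hidden[i].repeat(word2ph[i], 1) for i in range(len(word2ph))], dim=0
)
features = phone_level.T           # shape (1024, sum(word2ph)) == (1024, 7)
assert features.shape == (1024, sum(word2ph))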
onnx_modules/V200/text/japanese.py DELETED
@@ -1,403 +0,0 @@
1
- # Convert Japanese text to phonemes which is
2
- # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
- import re
4
- import unicodedata
5
-
6
- from transformers import AutoTokenizer
7
-
8
- from . import punctuation, symbols
9
-
10
- from num2words import num2words
11
-
12
- import pyopenjtalk
13
- import jaconv
14
-
15
-
16
- def kata2phoneme(text: str) -> str:
17
- """Convert katakana text to phonemes."""
18
- text = text.strip()
19
- if text == "ー":
20
- return ["ー"]
21
- elif text.startswith("ー"):
22
- return ["ー"] + kata2phoneme(text[1:])
23
- res = []
24
- prev = None
25
- while text:
26
- if re.match(_MARKS, text):
27
- res.append(text)
28
- text = text[1:]
29
- continue
30
- if text.startswith("ー"):
31
- if prev:
32
- res.append(prev[-1])
33
- text = text[1:]
34
- continue
35
- res += pyopenjtalk.g2p(text).lower().replace("cl", "q").split(" ")
36
- break
37
- # res = _COLON_RX.sub(":", res)
38
- return res
39
-
40
-
41
- def hira2kata(text: str) -> str:
42
- return jaconv.hira2kata(text)
43
-
44
-
45
- _SYMBOL_TOKENS = set(list("・、。?!"))
46
- _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
47
- _MARKS = re.compile(
48
- r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
49
- )
50
-
51
-
52
- def text2kata(text: str) -> str:
53
- parsed = pyopenjtalk.run_frontend(text)
54
-
55
- res = []
56
- for parts in parsed:
57
- word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
58
- "’", ""
59
- )
60
- if yomi:
61
- if re.match(_MARKS, yomi):
62
- if len(word) > 1:
63
- word = [replace_punctuation(i) for i in list(word)]
64
- yomi = word
65
- res += yomi
66
- sep += word
67
- continue
68
- elif word not in rep_map.keys() and word not in rep_map.values():
69
- word = ","
70
- yomi = word
71
- res.append(yomi)
72
- else:
73
- if word in _SYMBOL_TOKENS:
74
- res.append(word)
75
- elif word in ("っ", "ッ"):
76
- res.append("ッ")
77
- elif word in _NO_YOMI_TOKENS:
78
- pass
79
- else:
80
- res.append(word)
81
- return hira2kata("".join(res))
82
-
83
-
84
- def text2sep_kata(text: str) -> (list, list):
85
- parsed = pyopenjtalk.run_frontend(text)
86
-
87
- res = []
88
- sep = []
89
- for parts in parsed:
90
- word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
91
- "’", ""
92
- )
93
- if yomi:
94
- if re.match(_MARKS, yomi):
95
- if len(word) > 1:
96
- word = [replace_punctuation(i) for i in list(word)]
97
- yomi = word
98
- res += yomi
99
- sep += word
100
- continue
101
- elif word not in rep_map.keys() and word not in rep_map.values():
102
- word = ","
103
- yomi = word
104
- res.append(yomi)
105
- else:
106
- if word in _SYMBOL_TOKENS:
107
- res.append(word)
108
- elif word in ("っ", "ッ"):
109
- res.append("ッ")
110
- elif word in _NO_YOMI_TOKENS:
111
- pass
112
- else:
113
- res.append(word)
114
- sep.append(word)
115
- return sep, [hira2kata(i) for i in res], get_accent(parsed)
116
-
117
-
118
- def get_accent(parsed):
119
- labels = pyopenjtalk.make_label(parsed)
120
-
121
- phonemes = []
122
- accents = []
123
- for n, label in enumerate(labels):
124
- phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
125
- if phoneme not in ["sil", "pau"]:
126
- phonemes.append(phoneme.replace("cl", "q").lower())
127
- else:
128
- continue
129
- a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
130
- a2 = int(re.search(r"\+(\d+)\+", label).group(1))
131
- if re.search(r"\-([^\+]*)\+", labels[n + 1]).group(1) in ["sil", "pau"]:
132
- a2_next = -1
133
- else:
134
- a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
135
- # Falling
136
- if a1 == 0 and a2_next == a2 + 1:
137
- accents.append(-1)
138
- # Rising
139
- elif a2 == 1 and a2_next == 2:
140
- accents.append(1)
141
- else:
142
- accents.append(0)
143
- return list(zip(phonemes, accents))
144
-
145
-
146
- _ALPHASYMBOL_YOMI = {
147
- "#": "シャープ",
148
- "%": "パーセント",
149
- "&": "アンド",
150
- "+": "プラス",
151
- "-": "マイナス",
152
- ":": "コロン",
153
- ";": "セミコロン",
154
- "<": "小なり",
155
- "=": "イコール",
156
- ">": "大なり",
157
- "@": "アット",
158
- "a": "エー",
159
- "b": "ビー",
160
- "c": "シー",
161
- "d": "ディー",
162
- "e": "イー",
163
- "f": "エフ",
164
- "g": "ジー",
165
- "h": "エイチ",
166
- "i": "アイ",
167
- "j": "ジェー",
168
- "k": "ケー",
169
- "l": "エル",
170
- "m": "エム",
171
- "n": "エヌ",
172
- "o": "オー",
173
- "p": "ピー",
174
- "q": "キュー",
175
- "r": "アール",
176
- "s": "エス",
177
- "t": "ティー",
178
- "u": "ユー",
179
- "v": "ブイ",
180
- "w": "ダブリュー",
181
- "x": "エックス",
182
- "y": "ワイ",
183
- "z": "ゼット",
184
- "α": "アルファ",
185
- "β": "ベータ",
186
- "γ": "ガンマ",
187
- "δ": "デルタ",
188
- "ε": "イプシロン",
189
- "ζ": "ゼータ",
190
- "η": "イータ",
191
- "θ": "シータ",
192
- "ι": "イオタ",
193
- "κ": "カッパ",
194
- "λ": "ラムダ",
195
- "μ": "ミュー",
196
- "ν": "ニュー",
197
- "ξ": "クサイ",
198
- "ο": "オミクロン",
199
- "π": "パイ",
200
- "ρ": "ロー",
201
- "σ": "シグマ",
202
- "τ": "タウ",
203
- "υ": "ウプシロン",
204
- "φ": "ファイ",
205
- "χ": "カイ",
206
- "ψ": "プサイ",
207
- "ω": "オメガ",
208
- }
209
-
210
-
211
- _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
212
- _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
213
- _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
214
- _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
215
-
216
-
217
- def japanese_convert_numbers_to_words(text: str) -> str:
218
- res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
219
- res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
220
- res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
221
- return res
222
-
223
-
224
- def japanese_convert_alpha_symbols_to_words(text: str) -> str:
225
- return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
226
-
227
-
228
- def japanese_text_to_phonemes(text: str) -> str:
229
- """Convert Japanese text to phonemes."""
230
- res = unicodedata.normalize("NFKC", text)
231
- res = japanese_convert_numbers_to_words(res)
232
- # res = japanese_convert_alpha_symbols_to_words(res)
233
- res = text2kata(res)
234
- res = kata2phoneme(res)
235
- return res
236
-
237
-
238
- def is_japanese_character(char):
239
- # Define the Unicode ranges for the Japanese writing systems
240
- japanese_ranges = [
241
- (0x3040, 0x309F), # Hiragana
243
- (0x30A0, 0x30FF), # Katakana
244
- (0x4E00, 0x9FFF), # Kanji (CJK Unified Ideographs)
245
- (0x3400, 0x4DBF), # CJK Unified Ideographs Extension A
246
- (0x20000, 0x2A6DF), # CJK Unified Ideographs Extension B
246
- # More CJK extension ranges can be added here as needed
247
- ]
248
-
249
- # Convert the character to its integer Unicode code point
250
- char_code = ord(char)
251
-
252
- # Check whether the character falls within any of the Japanese ranges
253
- for start, end in japanese_ranges:
254
- if start <= char_code <= end:
255
- return True
256
-
257
- return False
258
-
259
-
260
- rep_map = {
261
- ":": ",",
262
- ";": ",",
263
- ",": ",",
264
- "。": ".",
265
- "!": "!",
266
- "?": "?",
267
- "\n": ".",
268
- ".": ".",
269
- "...": "…",
270
- "···": "…",
271
- "・・・": "…",
272
- "·": ",",
273
- "・": ",",
274
- "、": ",",
275
- "$": ".",
276
- "“": "'",
277
- "”": "'",
278
- "‘": "'",
279
- "’": "'",
280
- "(": "'",
281
- ")": "'",
282
- "(": "'",
283
- ")": "'",
284
- "《": "'",
285
- "》": "'",
286
- "【": "'",
287
- "】": "'",
288
- "[": "'",
289
- "]": "'",
290
- "—": "-",
291
- "−": "-",
292
- "~": "-",
293
- "~": "-",
294
- "「": "'",
295
- "」": "'",
296
- }
297
-
298
-
299
- def replace_punctuation(text):
300
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
301
-
302
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
303
-
304
- replaced_text = re.sub(
305
- r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
306
- + "".join(punctuation)
307
- + r"]+",
308
- "",
309
- replaced_text,
310
- )
311
-
312
- return replaced_text
313
-
314
-
315
- def text_normalize(text):
316
- res = unicodedata.normalize("NFKC", text)
317
- res = japanese_convert_numbers_to_words(res)
318
- # res = "".join([i for i in res if is_japanese_character(i)])
319
- res = replace_punctuation(res)
320
- return res
321
-
322
-
323
- def distribute_phone(n_phone, n_word):
324
- phones_per_word = [0] * n_word
325
- for task in range(n_phone):
326
- min_tasks = min(phones_per_word)
327
- min_index = phones_per_word.index(min_tasks)
328
- phones_per_word[min_index] += 1
329
- return phones_per_word
330
-
331
-
332
- def handle_long(sep_phonemes):
333
- for i in range(len(sep_phonemes)):
334
- if sep_phonemes[i][0] == "ー":
335
- sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
336
- if "ー" in sep_phonemes[i]:
337
- for j in range(len(sep_phonemes[i])):
338
- if sep_phonemes[i][j] == "ー":
339
- sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
340
- return sep_phonemes
341
-
342
-
343
- tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese")
344
-
345
-
346
- def align_tones(phones, tones):
347
- res = []
348
- for pho in phones:
349
- temp = [0] * len(pho)
350
- for idx, p in enumerate(pho):
351
- if len(tones) == 0:
352
- break
353
- if p == tones[0][0]:
354
- temp[idx] = tones[0][1]
355
- if idx > 0:
356
- temp[idx] += temp[idx - 1]
357
- tones.pop(0)
358
- temp = [0] + temp
359
- temp = temp[:-1]
360
- if -1 in temp:
361
- temp = [i + 1 for i in temp]
362
- res.append(temp)
363
- res = [i for j in res for i in j]
364
- assert not any([i < 0 for i in res]) and not any([i > 1 for i in res])
365
- return res
366
-
367
-
368
- def g2p(norm_text):
369
- sep_text, sep_kata, acc = text2sep_kata(norm_text)
370
- sep_tokenized = [tokenizer.tokenize(i) for i in sep_text]
371
- sep_phonemes = handle_long([kata2phoneme(i) for i in sep_kata])
372
- # Error handling: words MeCab does not recognize propagate all the way here and crash; so far only extremely rare characters trigger this
373
- for i in sep_phonemes:
374
- for j in i:
375
- assert j in symbols, (sep_text, sep_kata, sep_phonemes)
376
- tones = align_tones(sep_phonemes, acc)
377
-
378
- word2ph = []
379
- for token, phoneme in zip(sep_tokenized, sep_phonemes):
380
- phone_len = len(phoneme)
381
- word_len = len(token)
382
-
383
- aaa = distribute_phone(phone_len, word_len)
384
- word2ph += aaa
385
- phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
386
- tones = [0] + tones + [0]
387
- word2ph = [1] + word2ph + [1]
388
- assert len(phones) == len(tones)
389
- return phones, tones, word2ph
390
-
391
-
392
- if __name__ == "__main__":
393
- tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese")
394
- text = "hello,こんにちは、世界ー!……"
395
- from text.japanese_bert import get_bert_feature
396
-
397
- text = text_normalize(text)
398
- print(text)
399
-
400
- phones, tones, word2ph = g2p(text)
401
- bert = get_bert_feature(text, word2ph)
402
-
403
- print(phones, tones, word2ph, bert.shape)
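Two helpers above, distribute_phone and handle_long, are pure functions and easy to exercise in isolation. A hedged sketch (the "text.japanese" import path is an assumption, and importing the module also initializes pyopenjtalk and loads the local tokenizer from ./bert/deberta-v2-large-japanese, so those assets must exist):

# Hedged sketch of the small pure helpers from the Japanese front end above.
from text.japanese import distribute_phone, handle_long  # assumed import path

# distribute_phone spreads n_phone phones as evenly as possible over n_word tokens.
assert distribute_phone(5, 2) == [3, 2]

# handle_long replaces the long-vowel mark "ー" with the preceding phoneme.
assert handle_long([["k", "o"], ["ー"]]) == [["k", "o"], ["o"]]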
onnx_modules/V200/text/japanese_bert.py DELETED
@@ -1,58 +0,0 @@
1
- import sys
2
-
3
- import torch
4
- from transformers import AutoModelForMaskedLM, AutoTokenizer
5
-
6
- from config import config
7
- from .japanese import text2sep_kata
8
-
9
- LOCAL_PATH = "./bert/deberta-v2-large-japanese"
10
-
11
- tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12
-
13
- models = dict()
14
-
15
-
16
- def get_bert_feature(text, word2ph, device=config.bert_gen_config.device):
17
- sep_text, _, _ = text2sep_kata(text)
18
- sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
19
- sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
20
- sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
21
- return get_bert_feature_with_token(sep_ids, word2ph, device)
22
-
23
-
24
- def get_bert_feature_with_token(tokens, word2ph, device=config.bert_gen_config.device):
25
- if (
26
- sys.platform == "darwin"
27
- and torch.backends.mps.is_available()
28
- and device == "cpu"
29
- ):
30
- device = "mps"
31
- if not device:
32
- device = "cuda"
33
- if device not in models.keys():
34
- models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
35
- with torch.no_grad():
36
- inputs = torch.tensor(tokens).to(device).unsqueeze(0)
37
- token_type_ids = torch.zeros_like(inputs).to(device)
38
- attention_mask = torch.ones_like(inputs).to(device)
39
- inputs = {
40
- "input_ids": inputs,
41
- "token_type_ids": token_type_ids,
42
- "attention_mask": attention_mask,
43
- }
44
-
45
- # for i in inputs:
46
- # inputs[i] = inputs[i].to(device)
47
- res = models[device](**inputs, output_hidden_states=True)
48
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
49
- assert inputs["input_ids"].shape[-1] == len(word2ph)
50
- word2phone = word2ph
51
- phone_level_feature = []
52
- for i in range(len(word2phone)):
53
- repeat_feature = res[i].repeat(word2phone[i], 1)
54
- phone_level_feature.append(repeat_feature)
55
-
56
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
57
-
58
- return phone_level_feature.T
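A hedged sketch of the token/word2ph contract that get_bert_feature_with_token() above relies on: ids are wrapped as [2] + flattened subword ids + [3] (presumably this tokenizer's CLS/SEP ids, as hardcoded in the diff), and word2ph must hold one entry per id. Dummy ids only; no model or tokenizer is loaded here.

# Hedged sketch with made-up subword ids.
sub_token_ids = [[102, 57], [880]]                 # pretend per-word subword ids
sep_ids = [2] + [i for ids in sub_token_ids for i in ids] + [3]

word2ph = [1, 2, 3, 1, 1]                          # one count per id, pads included
assert len(sep_ids) == len(word2ph)                # mirrors the assert in the diff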
onnx_modules/V200/text/opencpop-strict.txt DELETED
@@ -1,429 +0,0 @@
1
- a AA a
2
- ai AA ai
3
- an AA an
4
- ang AA ang
5
- ao AA ao
6
- ba b a
7
- bai b ai
8
- ban b an
9
- bang b ang
10
- bao b ao
11
- bei b ei
12
- ben b en
13
- beng b eng
14
- bi b i
15
- bian b ian
16
- biao b iao
17
- bie b ie
18
- bin b in
19
- bing b ing
20
- bo b o
21
- bu b u
22
- ca c a
23
- cai c ai
24
- can c an
25
- cang c ang
26
- cao c ao
27
- ce c e
28
- cei c ei
29
- cen c en
30
- ceng c eng
31
- cha ch a
32
- chai ch ai
33
- chan ch an
34
- chang ch ang
35
- chao ch ao
36
- che ch e
37
- chen ch en
38
- cheng ch eng
39
- chi ch ir
40
- chong ch ong
41
- chou ch ou
42
- chu ch u
43
- chua ch ua
44
- chuai ch uai
45
- chuan ch uan
46
- chuang ch uang
47
- chui ch ui
48
- chun ch un
49
- chuo ch uo
50
- ci c i0
51
- cong c ong
52
- cou c ou
53
- cu c u
54
- cuan c uan
55
- cui c ui
56
- cun c un
57
- cuo c uo
58
- da d a
59
- dai d ai
60
- dan d an
61
- dang d ang
62
- dao d ao
63
- de d e
64
- dei d ei
65
- den d en
66
- deng d eng
67
- di d i
68
- dia d ia
69
- dian d ian
70
- diao d iao
71
- die d ie
72
- ding d ing
73
- diu d iu
74
- dong d ong
75
- dou d ou
76
- du d u
77
- duan d uan
78
- dui d ui
79
- dun d un
80
- duo d uo
81
- e EE e
82
- ei EE ei
83
- en EE en
84
- eng EE eng
85
- er EE er
86
- fa f a
87
- fan f an
88
- fang f ang
89
- fei f ei
90
- fen f en
91
- feng f eng
92
- fo f o
93
- fou f ou
94
- fu f u
95
- ga g a
96
- gai g ai
97
- gan g an
98
- gang g ang
99
- gao g ao
100
- ge g e
101
- gei g ei
102
- gen g en
103
- geng g eng
104
- gong g ong
105
- gou g ou
106
- gu g u
107
- gua g ua
108
- guai g uai
109
- guan g uan
110
- guang g uang
111
- gui g ui
112
- gun g un
113
- guo g uo
114
- ha h a
115
- hai h ai
116
- han h an
117
- hang h ang
118
- hao h ao
119
- he h e
120
- hei h ei
121
- hen h en
122
- heng h eng
123
- hong h ong
124
- hou h ou
125
- hu h u
126
- hua h ua
127
- huai h uai
128
- huan h uan
129
- huang h uang
130
- hui h ui
131
- hun h un
132
- huo h uo
133
- ji j i
134
- jia j ia
135
- jian j ian
136
- jiang j iang
137
- jiao j iao
138
- jie j ie
139
- jin j in
140
- jing j ing
141
- jiong j iong
142
- jiu j iu
143
- ju j v
144
- jv j v
145
- juan j van
146
- jvan j van
147
- jue j ve
148
- jve j ve
149
- jun j vn
150
- jvn j vn
151
- ka k a
152
- kai k ai
153
- kan k an
154
- kang k ang
155
- kao k ao
156
- ke k e
157
- kei k ei
158
- ken k en
159
- keng k eng
160
- kong k ong
161
- kou k ou
162
- ku k u
163
- kua k ua
164
- kuai k uai
165
- kuan k uan
166
- kuang k uang
167
- kui k ui
168
- kun k un
169
- kuo k uo
170
- la l a
171
- lai l ai
172
- lan l an
173
- lang l ang
174
- lao l ao
175
- le l e
176
- lei l ei
177
- leng l eng
178
- li l i
179
- lia l ia
180
- lian l ian
181
- liang l iang
182
- liao l iao
183
- lie l ie
184
- lin l in
185
- ling l ing
186
- liu l iu
187
- lo l o
188
- long l ong
189
- lou l ou
190
- lu l u
191
- luan l uan
192
- lun l un
193
- luo l uo
194
- lv l v
195
- lve l ve
196
- ma m a
197
- mai m ai
198
- man m an
199
- mang m ang
200
- mao m ao
201
- me m e
202
- mei m ei
203
- men m en
204
- meng m eng
205
- mi m i
206
- mian m ian
207
- miao m iao
208
- mie m ie
209
- min m in
210
- ming m ing
211
- miu m iu
212
- mo m o
213
- mou m ou
214
- mu m u
215
- na n a
216
- nai n ai
217
- nan n an
218
- nang n ang
219
- nao n ao
220
- ne n e
221
- nei n ei
222
- nen n en
223
- neng n eng
224
- ni n i
225
- nian n ian
226
- niang n iang
227
- niao n iao
228
- nie n ie
229
- nin n in
230
- ning n ing
231
- niu n iu
232
- nong n ong
233
- nou n ou
234
- nu n u
235
- nuan n uan
236
- nun n un
237
- nuo n uo
238
- nv n v
239
- nve n ve
240
- o OO o
241
- ou OO ou
242
- pa p a
243
- pai p ai
244
- pan p an
245
- pang p ang
246
- pao p ao
247
- pei p ei
248
- pen p en
249
- peng p eng
250
- pi p i
251
- pian p ian
252
- piao p iao
253
- pie p ie
254
- pin p in
255
- ping p ing
256
- po p o
257
- pou p ou
258
- pu p u
259
- qi q i
260
- qia q ia
261
- qian q ian
262
- qiang q iang
263
- qiao q iao
264
- qie q ie
265
- qin q in
266
- qing q ing
267
- qiong q iong
268
- qiu q iu
269
- qu q v
270
- qv q v
271
- quan q van
272
- qvan q van
273
- que q ve
274
- qve q ve
275
- qun q vn
276
- qvn q vn
277
- ran r an
278
- rang r ang
279
- rao r ao
280
- re r e
281
- ren r en
282
- reng r eng
283
- ri r ir
284
- rong r ong
285
- rou r ou
286
- ru r u
287
- rua r ua
288
- ruan r uan
289
- rui r ui
290
- run r un
291
- ruo r uo
292
- sa s a
293
- sai s ai
294
- san s an
295
- sang s ang
296
- sao s ao
297
- se s e
298
- sen s en
299
- seng s eng
300
- sha sh a
301
- shai sh ai
302
- shan sh an
303
- shang sh ang
304
- shao sh ao
305
- she sh e
306
- shei sh ei
307
- shen sh en
308
- sheng sh eng
309
- shi sh ir
310
- shou sh ou
311
- shu sh u
312
- shua sh ua
313
- shuai sh uai
314
- shuan sh uan
315
- shuang sh uang
316
- shui sh ui
317
- shun sh un
318
- shuo sh uo
319
- si s i0
320
- song s ong
321
- sou s ou
322
- su s u
323
- suan s uan
324
- sui s ui
325
- sun s un
326
- suo s uo
327
- ta t a
328
- tai t ai
329
- tan t an
330
- tang t ang
331
- tao t ao
332
- te t e
333
- tei t ei
334
- teng t eng
335
- ti t i
336
- tian t ian
337
- tiao t iao
338
- tie t ie
339
- ting t ing
340
- tong t ong
341
- tou t ou
342
- tu t u
343
- tuan t uan
344
- tui t ui
345
- tun t un
346
- tuo t uo
347
- wa w a
348
- wai w ai
349
- wan w an
350
- wang w ang
351
- wei w ei
352
- wen w en
353
- weng w eng
354
- wo w o
355
- wu w u
356
- xi x i
357
- xia x ia
358
- xian x ian
359
- xiang x iang
360
- xiao x iao
361
- xie x ie
362
- xin x in
363
- xing x ing
364
- xiong x iong
365
- xiu x iu
366
- xu x v
367
- xv x v
368
- xuan x van
369
- xvan x van
370
- xue x ve
371
- xve x ve
372
- xun x vn
373
- xvn x vn
374
- ya y a
375
- yan y En
376
- yang y ang
377
- yao y ao
378
- ye y E
379
- yi y i
380
- yin y in
381
- ying y ing
382
- yo y o
383
- yong y ong
384
- you y ou
385
- yu y v
386
- yv y v
387
- yuan y van
388
- yvan y van
389
- yue y ve
390
- yve y ve
391
- yun y vn
392
- yvn y vn
393
- za z a
394
- zai z ai
395
- zan z an
396
- zang z ang
397
- zao z ao
398
- ze z e
399
- zei z ei
400
- zen z en
401
- zeng z eng
402
- zha zh a
403
- zhai zh ai
404
- zhan zh an
405
- zhang zh ang
406
- zhao zh ao
407
- zhe zh e
408
- zhei zh ei
409
- zhen zh en
410
- zheng zh eng
411
- zhi zh ir
412
- zhong zh ong
413
- zhou zh ou
414
- zhu zh u
415
- zhua zh ua
416
- zhuai zh uai
417
- zhuan zh uan
418
- zhuang zh uang
419
- zhui zh ui
420
- zhun zh un
421
- zhuo zh uo
422
- zi z i0
423
- zong z ong
424
- zou z ou
425
- zu z u
426
- zuan z uan
427
- zui z ui
428
- zun z un
429
- zuo z uo
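The table above maps each pinyin syllable to an initial/final pair (single-vowel syllables use the AA/EE/OO placeholder initials seen at the top). A hedged loader sketch, assuming the file sits next to the script; split() is used so it works whether the columns are tab- or space-separated:

# Hedged sketch: parse opencpop-strict.txt into {syllable: [initial, final]}.
pinyin_to_symbol = {}
with open("opencpop-strict.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) == 3:
            pinyin_to_symbol[parts[0]] = parts[1:]

assert pinyin_to_symbol.get("zhuang") == ["zh", "uang"]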
onnx_modules/V200/text/symbols.py DELETED
@@ -1,187 +0,0 @@
1
- punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
- pu_symbols = punctuation + ["SP", "UNK"]
3
- pad = "_"
4
-
5
- # chinese
6
- zh_symbols = [
7
- "E",
8
- "En",
9
- "a",
10
- "ai",
11
- "an",
12
- "ang",
13
- "ao",
14
- "b",
15
- "c",
16
- "ch",
17
- "d",
18
- "e",
19
- "ei",
20
- "en",
21
- "eng",
22
- "er",
23
- "f",
24
- "g",
25
- "h",
26
- "i",
27
- "i0",
28
- "ia",
29
- "ian",
30
- "iang",
31
- "iao",
32
- "ie",
33
- "in",
34
- "ing",
35
- "iong",
36
- "ir",
37
- "iu",
38
- "j",
39
- "k",
40
- "l",
41
- "m",
42
- "n",
43
- "o",
44
- "ong",
45
- "ou",
46
- "p",
47
- "q",
48
- "r",
49
- "s",
50
- "sh",
51
- "t",
52
- "u",
53
- "ua",
54
- "uai",
55
- "uan",
56
- "uang",
57
- "ui",
58
- "un",
59
- "uo",
60
- "v",
61
- "van",
62
- "ve",
63
- "vn",
64
- "w",
65
- "x",
66
- "y",
67
- "z",
68
- "zh",
69
- "AA",
70
- "EE",
71
- "OO",
72
- ]
73
- num_zh_tones = 6
74
-
75
- # japanese
76
- ja_symbols = [
77
- "N",
78
- "a",
79
- "a:",
80
- "b",
81
- "by",
82
- "ch",
83
- "d",
84
- "dy",
85
- "e",
86
- "e:",
87
- "f",
88
- "g",
89
- "gy",
90
- "h",
91
- "hy",
92
- "i",
93
- "i:",
94
- "j",
95
- "k",
96
- "ky",
97
- "m",
98
- "my",
99
- "n",
100
- "ny",
101
- "o",
102
- "o:",
103
- "p",
104
- "py",
105
- "q",
106
- "r",
107
- "ry",
108
- "s",
109
- "sh",
110
- "t",
111
- "ts",
112
- "ty",
113
- "u",
114
- "u:",
115
- "w",
116
- "y",
117
- "z",
118
- "zy",
119
- ]
120
- num_ja_tones = 2
121
-
122
- # English
123
- en_symbols = [
124
- "aa",
125
- "ae",
126
- "ah",
127
- "ao",
128
- "aw",
129
- "ay",
130
- "b",
131
- "ch",
132
- "d",
133
- "dh",
134
- "eh",
135
- "er",
136
- "ey",
137
- "f",
138
- "g",
139
- "hh",
140
- "ih",
141
- "iy",
142
- "jh",
143
- "k",
144
- "l",
145
- "m",
146
- "n",
147
- "ng",
148
- "ow",
149
- "oy",
150
- "p",
151
- "r",
152
- "s",
153
- "sh",
154
- "t",
155
- "th",
156
- "uh",
157
- "uw",
158
- "V",
159
- "w",
160
- "y",
161
- "z",
162
- "zh",
163
- ]
164
- num_en_tones = 4
165
-
166
- # combine all symbols
167
- normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168
- symbols = [pad] + normal_symbols + pu_symbols
169
- sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170
-
171
- # combine all tones
172
- num_tones = num_zh_tones + num_ja_tones + num_en_tones
173
-
174
- # language maps
175
- language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176
- num_languages = len(language_id_map.keys())
177
-
178
- language_tone_start_map = {
179
- "ZH": 0,
180
- "JP": num_zh_tones,
181
- "EN": num_zh_tones + num_ja_tones,
182
- }
183
-
184
- if __name__ == "__main__":
185
- a = set(zh_symbols)
186
- b = set(en_symbols)
187
- print(sorted(a & b))
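A hedged sketch of how the tables above are typically consumed: build a symbol-to-id lookup and sanity-check the combined tone count (6 Chinese + 2 Japanese + 4 English). The "text.symbols" import path is an assumption; the asserted values follow directly from the file.

from text.symbols import symbols, num_tones, language_tone_start_map, language_id_map

symbol_to_id = {s: i for i, s in enumerate(symbols)}

assert symbols[0] == "_"                   # pad symbol sits at index 0
assert num_tones == 12                     # 6 + 2 + 4
assert language_tone_start_map["EN"] == 8  # EN tones start after ZH (6) + JP (2)
assert len(language_id_map) == 3           # ZH / JP / EN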
onnx_modules/V200/text/tone_sandhi.py DELETED
@@ -1,769 +0,0 @@
1
- # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import List
15
- from typing import Tuple
16
-
17
- import jieba
18
- from pypinyin import lazy_pinyin
19
- from pypinyin import Style
20
-
21
-
22
- class ToneSandhi:
23
- def __init__(self):
24
- self.must_neural_tone_words = {
25
- "麻烦",
26
- "麻利",
27
- "鸳鸯",
28
- "高粱",
29
- "骨头",
30
- "骆驼",
31
- "马虎",
32
- "首饰",
33
- "馒头",
34
- "馄饨",
35
- "风筝",
36
- "难为",
37
- "队伍",
38
- "阔气",
39
- "闺女",
40
- "门道",
41
- "锄头",
42
- "铺盖",
43
- "铃铛",
44
- "铁匠",
45
- "钥匙",
46
- "里脊",
47
- "里头",
48
- "部分",
49
- "那么",
50
- "道士",
51
- "造化",
52
- "迷糊",
53
- "连累",
54
- "这么",
55
- "这个",
56
- "运气",
57
- "过去",
58
- "软和",
59
- "转悠",
60
- "踏实",
61
- "跳蚤",
62
- "跟头",
63
- "趔趄",
64
- "财主",
65
- "豆腐",
66
- "讲究",
67
- "记性",
68
- "记号",
69
- "认识",
70
- "规矩",
71
- "见识",
72
- "裁缝",
73
- "补丁",
74
- "衣裳",
75
- "衣服",
76
- "衙门",
77
- "街坊",
78
- "行李",
79
- "行当",
80
- "蛤蟆",
81
- "蘑菇",
82
- "薄荷",
83
- "葫芦",
84
- "葡萄",
85
- "萝卜",
86
- "荸荠",
87
- "苗条",
88
- "苗头",
89
- "苍蝇",
90
- "芝麻",
91
- "舒服",
92
- "舒坦",
93
- "舌头",
94
- "自在",
95
- "膏药",
96
- "脾气",
97
- "脑袋",
98
- "脊梁",
99
- "能耐",
100
- "胳膊",
101
- "胭脂",
102
- "胡萝",
103
- "胡琴",
104
- "胡同",
105
- "聪明",
106
- "耽误",
107
- "耽搁",
108
- "耷拉",
109
- "耳朵",
110
- "老爷",
111
- "老实",
112
- "老婆",
113
- "老头",
114
- "老太",
115
- "翻腾",
116
- "罗嗦",
117
- "罐头",
118
- "编辑",
119
- "结实",
120
- "红火",
121
- "累赘",
122
- "糨糊",
123
- "糊涂",
124
- "精神",
125
- "粮食",
126
- "簸箕",
127
- "篱笆",
128
- "算计",
129
- "算盘",
130
- "答应",
131
- "笤帚",
132
- "笑语",
133
- "笑话",
134
- "窟窿",
135
- "窝囊",
136
- "窗户",
137
- "稳当",
138
- "稀罕",
139
- "称呼",
140
- "秧歌",
141
- "秀气",
142
- "秀才",
143
- "福气",
144
- "祖宗",
145
- "砚台",
146
- "码头",
147
- "石榴",
148
- "石头",
149
- "石匠",
150
- "知识",
151
- "眼睛",
152
- "眯缝",
153
- "眨巴",
154
- "眉毛",
155
- "相声",
156
- "盘算",
157
- "白净",
158
- "痢疾",
159
- "痛快",
160
- "疟疾",
161
- "疙瘩",
162
- "疏忽",
163
- "畜生",
164
- "生意",
165
- "甘蔗",
166
- "琵琶",
167
- "琢磨",
168
- "琉璃",
169
- "玻璃",
170
- "玫瑰",
171
- "玄乎",
172
- "狐狸",
173
- "状元",
174
- "特务",
175
- "牲口",
176
- "牙碜",
177
- "牌楼",
178
- "爽快",
179
- "爱人",
180
- "热闹",
181
- "烧饼",
182
- "烟筒",
183
- "烂糊",
184
- "点心",
185
- "炊帚",
186
- "灯笼",
187
- "火候",
188
- "漂亮",
189
- "滑溜",
190
- "溜达",
191
- "温和",
192
- "清楚",
193
- "消息",
194
- "浪头",
195
- "活泼",
196
- "比方",
197
- "正经",
198
- "欺负",
199
- "模糊",
200
- "槟榔",
201
- "棺材",
202
- "棒槌",
203
- "棉花",
204
- "核桃",
205
- "栅栏",
206
- "柴火",
207
- "架势",
208
- "枕头",
209
- "枇杷",
210
- "机灵",
211
- "本事",
212
- "木头",
213
- "木匠",
214
- "朋友",
215
- "月饼",
216
- "月亮",
217
- "暖和",
218
- "明白",
219
- "时候",
220
- "新鲜",
221
- "故事",
222
- "收拾",
223
- "收成",
224
- "提防",
225
- "挖苦",
226
- "挑剔",
227
- "指甲",
228
- "指头",
229
- "拾掇",
230
- "拳头",
231
- "拨弄",
232
- "招牌",
233
- "招呼",
234
- "抬举",
235
- "护士",
236
- "折腾",
237
- "扫帚",
238
- "打量",
239
- "打算",
240
- "打点",
241
- "打扮",
242
- "打听",
243
- "打发",
244
- "扎实",
245
- "扁担",
246
- "戒指",
247
- "懒得",
248
- "意识",
249
- "意思",
250
- "情形",
251
- "悟性",
252
- "怪物",
253
- "思量",
254
- "怎么",
255
- "念头",
256
- "念叨",
257
- "快活",
258
- "忙活",
259
- "志气",
260
- "心思",
261
- "得罪",
262
- "张罗",
263
- "弟兄",
264
- "开通",
265
- "应酬",
266
- "庄稼",
267
- "干事",
268
- "帮手",
269
- "帐篷",
270
- "希罕",
271
- "师父",
272
- "师傅",
273
- "巴结",
274
- "巴掌",
275
- "差事",
276
- "工夫",
277
- "岁数",
278
- "屁股",
279
- "尾巴",
280
- "少爷",
281
- "小气",
282
- "小伙",
283
- "将就",
284
- "对头",
285
- "对付",
286
- "寡妇",
287
- "家伙",
288
- "客气",
289
- "实在",
290
- "官司",
291
- "学问",
292
- "学生",
293
- "字号",
294
- "嫁妆",
295
- "媳妇",
296
- "媒人",
297
- "婆家",
298
- "娘家",
299
- "委屈",
300
- "姑娘",
301
- "姐夫",
302
- "妯娌",
303
- "妥当",
304
- "妖精",
305
- "奴才",
306
- "女婿",
307
- "头发",
308
- "太阳",
309
- "大爷",
310
- "大方",
311
- "大意",
312
- "大夫",
313
- "多少",
314
- "多么",
315
- "外甥",
316
- "壮实",
317
- "地道",
318
- "地方",
319
- "在乎",
320
- "困难",
321
- "嘴巴",
322
- "嘱咐",
323
- "嘟囔",
324
- "嘀咕",
325
- "喜欢",
326
- "喇嘛",
327
- "喇叭",
328
- "商量",
329
- "唾沫",
330
- "哑巴",
331
- "哈欠",
332
- "哆嗦",
333
- "咳嗽",
334
- "和尚",
335
- "告诉",
336
- "告示",
337
- "含糊",
338
- "吓唬",
339
- "后头",
340
- "名字",
341
- "名堂",
342
- "合同",
343
- "吆喝",
344
- "叫唤",
345
- "口袋",
346
- "厚道",
347
- "厉害",
348
- "千斤",
349
- "包袱",
350
- "包涵",
351
- "匀称",
352
- "勤快",
353
- "动静",
354
- "动弹",
355
- "功夫",
356
- "力气",
357
- "前头",
358
- "刺猬",
359
- "刺激",
360
- "别扭",
361
- "利落",
362
- "利索",
363
- "利害",
364
- "分析",
365
- "出息",
366
- "凑合",
367
- "凉快",
368
- "冷战",
369
- "冤枉",
370
- "冒失",
371
- "养活",
372
- "关系",
373
- "先生",
374
- "兄弟",
375
- "便宜",
376
- "使唤",
377
- "佩服",
378
- "作坊",
379
- "体面",
380
- "位置",
381
- "似的",
382
- "伙计",
383
- "休息",
384
- "什么",
385
- "人家",
386
- "亲戚",
387
- "亲家",
388
- "交情",
389
- "云彩",
390
- "事情",
391
- "买卖",
392
- "主意",
393
- "丫头",
394
- "丧气",
395
- "两口",
396
- "东西",
397
- "东家",
398
- "世故",
399
- "不由",
400
- "不在",
401
- "下水",
402
- "下巴",
403
- "上头",
404
- "上司",
405
- "丈夫",
406
- "丈人",
407
- "一辈",
408
- "那个",
409
- "菩萨",
410
- "父亲",
411
- "母亲",
412
- "咕噜",
413
- "邋遢",
414
- "费用",
415
- "冤家",
416
- "甜头",
417
- "介绍",
418
- "荒唐",
419
- "大人",
420
- "泥鳅",
421
- "幸福",
422
- "熟悉",
423
- "计划",
424
- "扑腾",
425
- "蜡烛",
426
- "姥爷",
427
- "照顾",
428
- "喉咙",
429
- "吉他",
430
- "弄堂",
431
- "蚂蚱",
432
- "凤凰",
433
- "拖沓",
434
- "寒碜",
435
- "糟蹋",
436
- "倒腾",
437
- "报复",
438
- "逻辑",
439
- "盘缠",
440
- "喽啰",
441
- "牢骚",
442
- "咖喱",
443
- "扫把",
444
- "惦记",
445
- }
446
- self.must_not_neural_tone_words = {
447
- "男子",
448
- "女子",
449
- "分子",
450
- "原子",
451
- "量子",
452
- "莲子",
453
- "石子",
454
- "瓜子",
455
- "电子",
456
- "人人",
457
- "虎虎",
458
- }
459
- self.punc = ":,;。?!“”‘’':,;.?!"
460
-
461
- # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
462
- # e.g.
463
- # word: "家里"
464
- # pos: "s"
465
- # finals: ['ia1', 'i3']
466
- def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
467
- # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
468
- for j, item in enumerate(word):
469
- if (
470
- j - 1 >= 0
471
- and item == word[j - 1]
472
- and pos[0] in {"n", "v", "a"}
473
- and word not in self.must_not_neural_tone_words
474
- ):
475
- finals[j] = finals[j][:-1] + "5"
476
- ge_idx = word.find("个")
477
- if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
478
- finals[-1] = finals[-1][:-1] + "5"
479
- elif len(word) >= 1 and word[-1] in "的地得":
480
- finals[-1] = finals[-1][:-1] + "5"
481
- # e.g. 走了, 看着, 去过
482
- # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
483
- # finals[-1] = finals[-1][:-1] + "5"
484
- elif (
485
- len(word) > 1
486
- and word[-1] in "们子"
487
- and pos in {"r", "n"}
488
- and word not in self.must_not_neural_tone_words
489
- ):
490
- finals[-1] = finals[-1][:-1] + "5"
491
- # e.g. 桌上, 地下, 家里
492
- elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
493
- finals[-1] = finals[-1][:-1] + "5"
494
- # e.g. 上来, 下去
495
- elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
496
- finals[-1] = finals[-1][:-1] + "5"
497
- # 个做量词
498
- elif (
499
- ge_idx >= 1
500
- and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
501
- ) or word == "个":
502
- finals[ge_idx] = finals[ge_idx][:-1] + "5"
503
- else:
504
- if (
505
- word in self.must_neural_tone_words
506
- or word[-2:] in self.must_neural_tone_words
507
- ):
508
- finals[-1] = finals[-1][:-1] + "5"
509
-
510
- word_list = self._split_word(word)
511
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
512
- for i, word in enumerate(word_list):
513
- # conventional neural in Chinese
514
- if (
515
- word in self.must_neural_tone_words
516
- or word[-2:] in self.must_neural_tone_words
517
- ):
518
- finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
519
- finals = sum(finals_list, [])
520
- return finals
521
-
522
- def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
523
- # e.g. 看不懂
524
- if len(word) == 3 and word[1] == "不":
525
- finals[1] = finals[1][:-1] + "5"
526
- else:
527
- for i, char in enumerate(word):
528
- # "不" before tone4 should be bu2, e.g. 不怕
529
- if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
530
- finals[i] = finals[i][:-1] + "2"
531
- return finals
532
-
533
- def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
534
- # "一" in number sequences, e.g. 一零零, 二一零
535
- if word.find("一") != -1 and all(
536
- [item.isnumeric() for item in word if item != "一"]
537
- ):
538
- return finals
539
- # "一" between reduplication words should be yi5, e.g. 看一看
540
- elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
541
- finals[1] = finals[1][:-1] + "5"
542
- # when "一" is ordinal word, it should be yi1
543
- elif word.startswith("第一"):
544
- finals[1] = finals[1][:-1] + "1"
545
- else:
546
- for i, char in enumerate(word):
547
- if char == "一" and i + 1 < len(word):
548
- # "一" before tone4 should be yi2, e.g. 一段
549
- if finals[i + 1][-1] == "4":
550
- finals[i] = finals[i][:-1] + "2"
551
- # "一" before non-tone4 should be yi4, e.g. 一天
552
- else:
553
- # "一" 后面如果是标点,还读一声
554
- if word[i + 1] not in self.punc:
555
- finals[i] = finals[i][:-1] + "4"
556
- return finals
557
-
558
- def _split_word(self, word: str) -> List[str]:
559
- word_list = jieba.cut_for_search(word)
560
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
561
- first_subword = word_list[0]
562
- first_begin_idx = word.find(first_subword)
563
- if first_begin_idx == 0:
564
- second_subword = word[len(first_subword) :]
565
- new_word_list = [first_subword, second_subword]
566
- else:
567
- second_subword = word[: -len(first_subword)]
568
- new_word_list = [second_subword, first_subword]
569
- return new_word_list
570
-
571
- def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
572
- if len(word) == 2 and self._all_tone_three(finals):
573
- finals[0] = finals[0][:-1] + "2"
574
- elif len(word) == 3:
575
- word_list = self._split_word(word)
576
- if self._all_tone_three(finals):
577
- # disyllabic + monosyllabic, e.g. 蒙古/包
578
- if len(word_list[0]) == 2:
579
- finals[0] = finals[0][:-1] + "2"
580
- finals[1] = finals[1][:-1] + "2"
581
- # monosyllabic + disyllabic, e.g. 纸/老虎
582
- elif len(word_list[0]) == 1:
583
- finals[1] = finals[1][:-1] + "2"
584
- else:
585
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
586
- if len(finals_list) == 2:
587
- for i, sub in enumerate(finals_list):
588
- # e.g. 所有/人
589
- if self._all_tone_three(sub) and len(sub) == 2:
590
- finals_list[i][0] = finals_list[i][0][:-1] + "2"
591
- # e.g. 好/喜欢
592
- elif (
593
- i == 1
594
- and not self._all_tone_three(sub)
595
- and finals_list[i][0][-1] == "3"
596
- and finals_list[0][-1][-1] == "3"
597
- ):
598
- finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
599
- finals = sum(finals_list, [])
600
- # split idiom into two words who's length is 2
601
- elif len(word) == 4:
602
- finals_list = [finals[:2], finals[2:]]
603
- finals = []
604
- for sub in finals_list:
605
- if self._all_tone_three(sub):
606
- sub[0] = sub[0][:-1] + "2"
607
- finals += sub
608
-
609
- return finals
610
-
611
- def _all_tone_three(self, finals: List[str]) -> bool:
612
- return all(x[-1] == "3" for x in finals)
613
-
614
- # merge "不" and the word behind it
615
- # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
616
- def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
617
- new_seg = []
618
- last_word = ""
619
- for word, pos in seg:
620
- if last_word == "不":
621
- word = last_word + word
622
- if word != "不":
623
- new_seg.append((word, pos))
624
- last_word = word[:]
625
- if last_word == "不":
626
- new_seg.append((last_word, "d"))
627
- last_word = ""
628
- return new_seg
629
-
630
- # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
631
- # function 2: merge single "一" and the word behind it
632
- # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
633
- # e.g.
634
- # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
635
- # output seg: [['听一听', 'v']]
636
- def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
637
- new_seg = []
638
- # function 1
639
- for i, (word, pos) in enumerate(seg):
640
- if (
641
- i - 1 >= 0
642
- and word == "一"
643
- and i + 1 < len(seg)
644
- and seg[i - 1][0] == seg[i + 1][0]
645
- and seg[i - 1][1] == "v"
646
- ):
647
- new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
648
- else:
649
- if (
650
- i - 2 >= 0
651
- and seg[i - 1][0] == "一"
652
- and seg[i - 2][0] == word
653
- and pos == "v"
654
- ):
655
- continue
656
- else:
657
- new_seg.append([word, pos])
658
- seg = new_seg
659
- new_seg = []
660
- # function 2
661
- for i, (word, pos) in enumerate(seg):
662
- if new_seg and new_seg[-1][0] == "一":
663
- new_seg[-1][0] = new_seg[-1][0] + word
664
- else:
665
- new_seg.append([word, pos])
666
- return new_seg
667
-
668
- # the first and the second words are all_tone_three
669
- def _merge_continuous_three_tones(
670
- self, seg: List[Tuple[str, str]]
671
- ) -> List[Tuple[str, str]]:
672
- new_seg = []
673
- sub_finals_list = [
674
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
675
- for (word, pos) in seg
676
- ]
677
- assert len(sub_finals_list) == len(seg)
678
- merge_last = [False] * len(seg)
679
- for i, (word, pos) in enumerate(seg):
680
- if (
681
- i - 1 >= 0
682
- and self._all_tone_three(sub_finals_list[i - 1])
683
- and self._all_tone_three(sub_finals_list[i])
684
- and not merge_last[i - 1]
685
- ):
686
- # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
687
- if (
688
- not self._is_reduplication(seg[i - 1][0])
689
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
690
- ):
691
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
692
- merge_last[i] = True
693
- else:
694
- new_seg.append([word, pos])
695
- else:
696
- new_seg.append([word, pos])
697
-
698
- return new_seg
699
-
700
- def _is_reduplication(self, word: str) -> bool:
701
- return len(word) == 2 and word[0] == word[1]
702
-
703
- # the last char of first word and the first char of second word is tone_three
704
- def _merge_continuous_three_tones_2(
705
- self, seg: List[Tuple[str, str]]
706
- ) -> List[Tuple[str, str]]:
707
- new_seg = []
708
- sub_finals_list = [
709
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
710
- for (word, pos) in seg
711
- ]
712
- assert len(sub_finals_list) == len(seg)
713
- merge_last = [False] * len(seg)
714
- for i, (word, pos) in enumerate(seg):
715
- if (
716
- i - 1 >= 0
717
- and sub_finals_list[i - 1][-1][-1] == "3"
718
- and sub_finals_list[i][0][-1] == "3"
719
- and not merge_last[i - 1]
720
- ):
721
- # if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
722
- if (
723
- not self._is_reduplication(seg[i - 1][0])
724
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
725
- ):
726
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
727
- merge_last[i] = True
728
- else:
729
- new_seg.append([word, pos])
730
- else:
731
- new_seg.append([word, pos])
732
- return new_seg
733
-
734
- def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
735
- new_seg = []
736
- for i, (word, pos) in enumerate(seg):
737
- if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
738
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
739
- else:
740
- new_seg.append([word, pos])
741
- return new_seg
742
-
743
- def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
744
- new_seg = []
745
- for i, (word, pos) in enumerate(seg):
746
- if new_seg and word == new_seg[-1][0]:
747
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
748
- else:
749
- new_seg.append([word, pos])
750
- return new_seg
751
-
752
- def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
753
- seg = self._merge_bu(seg)
754
- try:
755
- seg = self._merge_yi(seg)
756
- except:
757
- print("_merge_yi failed")
758
- seg = self._merge_reduplication(seg)
759
- seg = self._merge_continuous_three_tones(seg)
760
- seg = self._merge_continuous_three_tones_2(seg)
761
- seg = self._merge_er(seg)
762
- return seg
763
-
764
- def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
765
- finals = self._bu_sandhi(word, finals)
766
- finals = self._yi_sandhi(word, finals)
767
- finals = self._neural_sandhi(word, pos, finals)
768
- finals = self._three_sandhi(word, finals)
769
- return finals
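A hedged sketch of how ToneSandhi above is usually driven: segment with jieba's POS tagger, pre-merge the segments, then rewrite each word's pinyin finals. The lazy_pinyin call mirrors the one used inside _merge_continuous_three_tones; the import path and the surrounding glue are assumptions, not code from the deleted files.

import jieba.posseg as psg
from pypinyin import lazy_pinyin, Style

from text.tone_sandhi import ToneSandhi  # assumed import path

sandhi = ToneSandhi()
seg = [(w, p) for w, p in psg.lcut("你好,我想买一个苹果。")]
seg = sandhi.pre_merge_for_modify(seg)

for word, pos in seg:
    finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    finals = sandhi.modified_tone(word, pos, finals)
    print(word, finals)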
onnx_modules/V210/__init__.py DELETED
File without changes
onnx_modules/V210/attentions_onnx.py DELETED
@@ -1,378 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import logging
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- class LayerNorm(nn.Module):
13
- def __init__(self, channels, eps=1e-5):
14
- super().__init__()
15
- self.channels = channels
16
- self.eps = eps
17
-
18
- self.gamma = nn.Parameter(torch.ones(channels))
19
- self.beta = nn.Parameter(torch.zeros(channels))
20
-
21
- def forward(self, x):
22
- x = x.transpose(1, -1)
23
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
- return x.transpose(1, -1)
25
-
26
-
27
- @torch.jit.script
28
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
- n_channels_int = n_channels[0]
30
- in_act = input_a + input_b
31
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
- acts = t_act * s_act
34
- return acts
35
-
36
-
37
- class Encoder(nn.Module):
38
- def __init__(
39
- self,
40
- hidden_channels,
41
- filter_channels,
42
- n_heads,
43
- n_layers,
44
- kernel_size=1,
45
- p_dropout=0.0,
46
- window_size=4,
47
- isflow=True,
48
- **kwargs
49
- ):
50
- super().__init__()
51
- self.hidden_channels = hidden_channels
52
- self.filter_channels = filter_channels
53
- self.n_heads = n_heads
54
- self.n_layers = n_layers
55
- self.kernel_size = kernel_size
56
- self.p_dropout = p_dropout
57
- self.window_size = window_size
58
- # if isflow:
59
- # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
- # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
- # self.cond_layer = weight_norm(cond_layer, name='weight')
62
- # self.gin_channels = 256
63
- self.cond_layer_idx = self.n_layers
64
- if "gin_channels" in kwargs:
65
- self.gin_channels = kwargs["gin_channels"]
66
- if self.gin_channels != 0:
67
- self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
- # vits2 says 3rd block, so idx is 2 by default
69
- self.cond_layer_idx = (
70
- kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
- )
72
- logging.debug(self.gin_channels, self.cond_layer_idx)
73
- assert (
74
- self.cond_layer_idx < self.n_layers
75
- ), "cond_layer_idx should be less than n_layers"
76
- self.drop = nn.Dropout(p_dropout)
77
- self.attn_layers = nn.ModuleList()
78
- self.norm_layers_1 = nn.ModuleList()
79
- self.ffn_layers = nn.ModuleList()
80
- self.norm_layers_2 = nn.ModuleList()
81
- for i in range(self.n_layers):
82
- self.attn_layers.append(
83
- MultiHeadAttention(
84
- hidden_channels,
85
- hidden_channels,
86
- n_heads,
87
- p_dropout=p_dropout,
88
- window_size=window_size,
89
- )
90
- )
91
- self.norm_layers_1.append(LayerNorm(hidden_channels))
92
- self.ffn_layers.append(
93
- FFN(
94
- hidden_channels,
95
- hidden_channels,
96
- filter_channels,
97
- kernel_size,
98
- p_dropout=p_dropout,
99
- )
100
- )
101
- self.norm_layers_2.append(LayerNorm(hidden_channels))
102
-
103
- def forward(self, x, x_mask, g=None):
104
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
105
- x = x * x_mask
106
- for i in range(self.n_layers):
107
- if i == self.cond_layer_idx and g is not None:
108
- g = self.spk_emb_linear(g.transpose(1, 2))
109
- g = g.transpose(1, 2)
110
- x = x + g
111
- x = x * x_mask
112
- y = self.attn_layers[i](x, x, attn_mask)
113
- y = self.drop(y)
114
- x = self.norm_layers_1[i](x + y)
115
-
116
- y = self.ffn_layers[i](x, x_mask)
117
- y = self.drop(y)
118
- x = self.norm_layers_2[i](x + y)
119
- x = x * x_mask
120
- return x
121
-
122
-
123
- class MultiHeadAttention(nn.Module):
124
- def __init__(
125
- self,
126
- channels,
127
- out_channels,
128
- n_heads,
129
- p_dropout=0.0,
130
- window_size=None,
131
- heads_share=True,
132
- block_length=None,
133
- proximal_bias=False,
134
- proximal_init=False,
135
- ):
136
- super().__init__()
137
- assert channels % n_heads == 0
138
-
139
- self.channels = channels
140
- self.out_channels = out_channels
141
- self.n_heads = n_heads
142
- self.p_dropout = p_dropout
143
- self.window_size = window_size
144
- self.heads_share = heads_share
145
- self.block_length = block_length
146
- self.proximal_bias = proximal_bias
147
- self.proximal_init = proximal_init
148
- self.attn = None
149
-
150
- self.k_channels = channels // n_heads
151
- self.conv_q = nn.Conv1d(channels, channels, 1)
152
- self.conv_k = nn.Conv1d(channels, channels, 1)
153
- self.conv_v = nn.Conv1d(channels, channels, 1)
154
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
155
- self.drop = nn.Dropout(p_dropout)
156
-
157
- if window_size is not None:
158
- n_heads_rel = 1 if heads_share else n_heads
159
- rel_stddev = self.k_channels**-0.5
160
- self.emb_rel_k = nn.Parameter(
161
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
162
- * rel_stddev
163
- )
164
- self.emb_rel_v = nn.Parameter(
165
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
166
- * rel_stddev
167
- )
168
-
169
- nn.init.xavier_uniform_(self.conv_q.weight)
170
- nn.init.xavier_uniform_(self.conv_k.weight)
171
- nn.init.xavier_uniform_(self.conv_v.weight)
172
- if proximal_init:
173
- with torch.no_grad():
174
- self.conv_k.weight.copy_(self.conv_q.weight)
175
- self.conv_k.bias.copy_(self.conv_q.bias)
176
-
177
- def forward(self, x, c, attn_mask=None):
178
- q = self.conv_q(x)
179
- k = self.conv_k(c)
180
- v = self.conv_v(c)
181
-
182
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
183
-
184
- x = self.conv_o(x)
185
- return x
186
-
187
- def attention(self, query, key, value, mask=None):
188
- # reshape [b, d, t] -> [b, n_h, t, d_k]
189
- b, d, t_s, t_t = (*key.size(), query.size(2))
190
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
191
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
192
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
193
-
194
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
195
- if self.window_size is not None:
196
- assert (
197
- t_s == t_t
198
- ), "Relative attention is only available for self-attention."
199
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
200
- rel_logits = self._matmul_with_relative_keys(
201
- query / math.sqrt(self.k_channels), key_relative_embeddings
202
- )
203
- scores_local = self._relative_position_to_absolute_position(rel_logits)
204
- scores = scores + scores_local
205
- if self.proximal_bias:
206
- assert t_s == t_t, "Proximal bias is only available for self-attention."
207
- scores = scores + self._attention_bias_proximal(t_s).to(
208
- device=scores.device, dtype=scores.dtype
209
- )
210
- if mask is not None:
211
- scores = scores.masked_fill(mask == 0, -1e4)
212
- if self.block_length is not None:
213
- assert (
214
- t_s == t_t
215
- ), "Local attention is only available for self-attention."
216
- block_mask = (
217
- torch.ones_like(scores)
218
- .triu(-self.block_length)
219
- .tril(self.block_length)
220
- )
221
- scores = scores.masked_fill(block_mask == 0, -1e4)
222
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
223
- p_attn = self.drop(p_attn)
224
- output = torch.matmul(p_attn, value)
225
- if self.window_size is not None:
226
- relative_weights = self._absolute_position_to_relative_position(p_attn)
227
- value_relative_embeddings = self._get_relative_embeddings(
228
- self.emb_rel_v, t_s
229
- )
230
- output = output + self._matmul_with_relative_values(
231
- relative_weights, value_relative_embeddings
232
- )
233
- output = (
234
- output.transpose(2, 3).contiguous().view(b, d, t_t)
235
- ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
236
- return output, p_attn
237
-
238
- def _matmul_with_relative_values(self, x, y):
239
- """
240
- x: [b, h, l, m]
241
- y: [h or 1, m, d]
242
- ret: [b, h, l, d]
243
- """
244
- ret = torch.matmul(x, y.unsqueeze(0))
245
- return ret
246
-
247
- def _matmul_with_relative_keys(self, x, y):
248
- """
249
- x: [b, h, l, d]
250
- y: [h or 1, m, d]
251
- ret: [b, h, l, m]
252
- """
253
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
254
- return ret
255
-
256
- def _get_relative_embeddings(self, relative_embeddings, length):
257
- max_relative_position = 2 * self.window_size + 1
258
- # Pad first before slice to avoid using cond ops.
259
- pad_length = max(length - (self.window_size + 1), 0)
260
- slice_start_position = max((self.window_size + 1) - length, 0)
261
- slice_end_position = slice_start_position + 2 * length - 1
262
- if pad_length > 0:
263
- padded_relative_embeddings = F.pad(
264
- relative_embeddings,
265
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
266
- )
267
- else:
268
- padded_relative_embeddings = relative_embeddings
269
- used_relative_embeddings = padded_relative_embeddings[
270
- :, slice_start_position:slice_end_position
271
- ]
272
- return used_relative_embeddings
273
-
274
- def _relative_position_to_absolute_position(self, x):
275
- """
276
- x: [b, h, l, 2*l-1]
277
- ret: [b, h, l, l]
278
- """
279
- batch, heads, length, _ = x.size()
280
- # Concat columns of pad to shift from relative to absolute indexing.
281
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
282
-
283
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
284
- x_flat = x.view([batch, heads, length * 2 * length])
285
- x_flat = F.pad(
286
- x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
287
- )
288
-
289
- # Reshape and slice out the padded elements.
290
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
291
- :, :, :length, length - 1 :
292
- ]
293
- return x_final
294
-
295
- def _absolute_position_to_relative_position(self, x):
296
- """
297
- x: [b, h, l, l]
298
- ret: [b, h, l, 2*l-1]
299
- """
300
- batch, heads, length, _ = x.size()
301
- # padd along column
302
- x = F.pad(
303
- x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
304
- )
305
- x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
306
- # add 0's in the beginning that will skew the elements after reshape
307
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
308
- x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
309
- return x_final
310
-
311
- def _attention_bias_proximal(self, length):
312
- """Bias for self-attention to encourage attention to close positions.
313
- Args:
314
- length: an integer scalar.
315
- Returns:
316
- a Tensor with shape [1, 1, length, length]
317
- """
318
- r = torch.arange(length, dtype=torch.float32)
319
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
320
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
321
-
322
-
323
- class FFN(nn.Module):
324
- def __init__(
325
- self,
326
- in_channels,
327
- out_channels,
328
- filter_channels,
329
- kernel_size,
330
- p_dropout=0.0,
331
- activation=None,
332
- causal=False,
333
- ):
334
- super().__init__()
335
- self.in_channels = in_channels
336
- self.out_channels = out_channels
337
- self.filter_channels = filter_channels
338
- self.kernel_size = kernel_size
339
- self.p_dropout = p_dropout
340
- self.activation = activation
341
- self.causal = causal
342
-
343
- if causal:
344
- self.padding = self._causal_padding
345
- else:
346
- self.padding = self._same_padding
347
-
348
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
349
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
350
- self.drop = nn.Dropout(p_dropout)
351
-
352
- def forward(self, x, x_mask):
353
- x = self.conv_1(self.padding(x * x_mask))
354
- if self.activation == "gelu":
355
- x = x * torch.sigmoid(1.702 * x)
356
- else:
357
- x = torch.relu(x)
358
- x = self.drop(x)
359
- x = self.conv_2(self.padding(x * x_mask))
360
- return x * x_mask
361
-
362
- def _causal_padding(self, x):
363
- if self.kernel_size == 1:
364
- return x
365
- pad_l = self.kernel_size - 1
366
- pad_r = 0
367
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
368
- x = F.pad(x, commons.convert_pad_shape(padding))
369
- return x
370
-
371
- def _same_padding(self, x):
372
- if self.kernel_size == 1:
373
- return x
374
- pad_l = (self.kernel_size - 1) // 2
375
- pad_r = self.kernel_size // 2
376
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
377
- x = F.pad(x, commons.convert_pad_shape(padding))
378
- return x
 
 
onnx_modules/V210/models_onnx.py DELETED
@@ -1,1044 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import modules
8
- from . import attentions_onnx
9
- from vector_quantize_pytorch import VectorQuantize
10
-
11
- from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
- from commons import init_weights, get_padding
14
- from .text import symbols, num_tones, num_languages
15
-
16
-
17
- class DurationDiscriminator(nn.Module): # vits2
18
- def __init__(
19
- self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
20
- ):
21
- super().__init__()
22
-
23
- self.in_channels = in_channels
24
- self.filter_channels = filter_channels
25
- self.kernel_size = kernel_size
26
- self.p_dropout = p_dropout
27
- self.gin_channels = gin_channels
28
-
29
- self.drop = nn.Dropout(p_dropout)
30
- self.conv_1 = nn.Conv1d(
31
- in_channels, filter_channels, kernel_size, padding=kernel_size // 2
32
- )
33
- self.norm_1 = modules.LayerNorm(filter_channels)
34
- self.conv_2 = nn.Conv1d(
35
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
36
- )
37
- self.norm_2 = modules.LayerNorm(filter_channels)
38
- self.dur_proj = nn.Conv1d(1, filter_channels, 1)
39
-
40
- self.pre_out_conv_1 = nn.Conv1d(
41
- 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
42
- )
43
- self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
44
- self.pre_out_conv_2 = nn.Conv1d(
45
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
46
- )
47
- self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
48
-
49
- if gin_channels != 0:
50
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
51
-
52
- self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
53
-
54
- def forward_probability(self, x, x_mask, dur, g=None):
55
- dur = self.dur_proj(dur)
56
- x = torch.cat([x, dur], dim=1)
57
- x = self.pre_out_conv_1(x * x_mask)
58
- x = torch.relu(x)
59
- x = self.pre_out_norm_1(x)
60
- x = self.drop(x)
61
- x = self.pre_out_conv_2(x * x_mask)
62
- x = torch.relu(x)
63
- x = self.pre_out_norm_2(x)
64
- x = self.drop(x)
65
- x = x * x_mask
66
- x = x.transpose(1, 2)
67
- output_prob = self.output_layer(x)
68
- return output_prob
69
-
70
- def forward(self, x, x_mask, dur_r, dur_hat, g=None):
71
- x = torch.detach(x)
72
- if g is not None:
73
- g = torch.detach(g)
74
- x = x + self.cond(g)
75
- x = self.conv_1(x * x_mask)
76
- x = torch.relu(x)
77
- x = self.norm_1(x)
78
- x = self.drop(x)
79
- x = self.conv_2(x * x_mask)
80
- x = torch.relu(x)
81
- x = self.norm_2(x)
82
- x = self.drop(x)
83
-
84
- output_probs = []
85
- for dur in [dur_r, dur_hat]:
86
- output_prob = self.forward_probability(x, x_mask, dur, g)
87
- output_probs.append(output_prob)
88
-
89
- return output_probs
90
-
91
-
92
- class TransformerCouplingBlock(nn.Module):
93
- def __init__(
94
- self,
95
- channels,
96
- hidden_channels,
97
- filter_channels,
98
- n_heads,
99
- n_layers,
100
- kernel_size,
101
- p_dropout,
102
- n_flows=4,
103
- gin_channels=0,
104
- share_parameter=False,
105
- ):
106
- super().__init__()
107
- self.channels = channels
108
- self.hidden_channels = hidden_channels
109
- self.kernel_size = kernel_size
110
- self.n_layers = n_layers
111
- self.n_flows = n_flows
112
- self.gin_channels = gin_channels
113
-
114
- self.flows = nn.ModuleList()
115
-
116
- self.wn = (
117
- attentions_onnx.FFT(
118
- hidden_channels,
119
- filter_channels,
120
- n_heads,
121
- n_layers,
122
- kernel_size,
123
- p_dropout,
124
- isflow=True,
125
- gin_channels=self.gin_channels,
126
- )
127
- if share_parameter
128
- else None
129
- )
130
-
131
- for i in range(n_flows):
132
- self.flows.append(
133
- modules.TransformerCouplingLayer(
134
- channels,
135
- hidden_channels,
136
- kernel_size,
137
- n_layers,
138
- n_heads,
139
- p_dropout,
140
- filter_channels,
141
- mean_only=True,
142
- wn_sharing_parameter=self.wn,
143
- gin_channels=self.gin_channels,
144
- )
145
- )
146
- self.flows.append(modules.Flip())
147
-
148
- def forward(self, x, x_mask, g=None, reverse=True):
149
- if not reverse:
150
- for flow in self.flows:
151
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
152
- else:
153
- for flow in reversed(self.flows):
154
- x = flow(x, x_mask, g=g, reverse=reverse)
155
- return x
156
-
157
-
158
- class StochasticDurationPredictor(nn.Module):
159
- def __init__(
160
- self,
161
- in_channels,
162
- filter_channels,
163
- kernel_size,
164
- p_dropout,
165
- n_flows=4,
166
- gin_channels=0,
167
- ):
168
- super().__init__()
169
- filter_channels = in_channels # it needs to be removed from future version.
170
- self.in_channels = in_channels
171
- self.filter_channels = filter_channels
172
- self.kernel_size = kernel_size
173
- self.p_dropout = p_dropout
174
- self.n_flows = n_flows
175
- self.gin_channels = gin_channels
176
-
177
- self.log_flow = modules.Log()
178
- self.flows = nn.ModuleList()
179
- self.flows.append(modules.ElementwiseAffine(2))
180
- for i in range(n_flows):
181
- self.flows.append(
182
- modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
183
- )
184
- self.flows.append(modules.Flip())
185
-
186
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
187
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
188
- self.post_convs = modules.DDSConv(
189
- filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
190
- )
191
- self.post_flows = nn.ModuleList()
192
- self.post_flows.append(modules.ElementwiseAffine(2))
193
- for i in range(4):
194
- self.post_flows.append(
195
- modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
196
- )
197
- self.post_flows.append(modules.Flip())
198
-
199
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
200
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
201
- self.convs = modules.DDSConv(
202
- filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
203
- )
204
- if gin_channels != 0:
205
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
206
-
207
- def forward(self, x, x_mask, z, g=None):
208
- x = torch.detach(x)
209
- x = self.pre(x)
210
- if g is not None:
211
- g = torch.detach(g)
212
- x = x + self.cond(g)
213
- x = self.convs(x, x_mask)
214
- x = self.proj(x) * x_mask
215
-
216
- flows = list(reversed(self.flows))
217
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
218
- for flow in flows:
219
- z = flow(z, x_mask, g=x, reverse=True)
220
- z0, z1 = torch.split(z, [1, 1], 1)
221
- logw = z0
222
- return logw
223
-
224
-
225
- class DurationPredictor(nn.Module):
226
- def __init__(
227
- self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
228
- ):
229
- super().__init__()
230
-
231
- self.in_channels = in_channels
232
- self.filter_channels = filter_channels
233
- self.kernel_size = kernel_size
234
- self.p_dropout = p_dropout
235
- self.gin_channels = gin_channels
236
-
237
- self.drop = nn.Dropout(p_dropout)
238
- self.conv_1 = nn.Conv1d(
239
- in_channels, filter_channels, kernel_size, padding=kernel_size // 2
240
- )
241
- self.norm_1 = modules.LayerNorm(filter_channels)
242
- self.conv_2 = nn.Conv1d(
243
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
244
- )
245
- self.norm_2 = modules.LayerNorm(filter_channels)
246
- self.proj = nn.Conv1d(filter_channels, 1, 1)
247
-
248
- if gin_channels != 0:
249
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
250
-
251
- def forward(self, x, x_mask, g=None):
252
- x = torch.detach(x)
253
- if g is not None:
254
- g = torch.detach(g)
255
- x = x + self.cond(g)
256
- x = self.conv_1(x * x_mask)
257
- x = torch.relu(x)
258
- x = self.norm_1(x)
259
- x = self.drop(x)
260
- x = self.conv_2(x * x_mask)
261
- x = torch.relu(x)
262
- x = self.norm_2(x)
263
- x = self.drop(x)
264
- x = self.proj(x * x_mask)
265
- return x * x_mask
266
-
267
-
268
- class TextEncoder(nn.Module):
269
- def __init__(
270
- self,
271
- n_vocab,
272
- out_channels,
273
- hidden_channels,
274
- filter_channels,
275
- n_heads,
276
- n_layers,
277
- kernel_size,
278
- p_dropout,
279
- n_speakers,
280
- gin_channels=0,
281
- ):
282
- super().__init__()
283
- self.n_vocab = n_vocab
284
- self.out_channels = out_channels
285
- self.hidden_channels = hidden_channels
286
- self.filter_channels = filter_channels
287
- self.n_heads = n_heads
288
- self.n_layers = n_layers
289
- self.kernel_size = kernel_size
290
- self.p_dropout = p_dropout
291
- self.gin_channels = gin_channels
292
- self.emb = nn.Embedding(len(symbols), hidden_channels)
293
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
294
- self.tone_emb = nn.Embedding(num_tones, hidden_channels)
295
- nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
296
- self.language_emb = nn.Embedding(num_languages, hidden_channels)
297
- nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
298
- self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
299
- self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
300
- self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
301
- self.emo_proj = nn.Linear(1024, 1024)
302
- self.emo_quantizer = nn.ModuleList()
303
- for i in range(0, n_speakers):
304
- self.emo_quantizer.append(
305
- VectorQuantize(
306
- dim=1024,
307
- codebook_size=10,
308
- decay=0.8,
309
- commitment_weight=1.0,
310
- learnable_codebook=True,
311
- ema_update=False,
312
- )
313
- )
314
- self.emo_q_proj = nn.Linear(1024, hidden_channels)
315
- self.n_speakers = n_speakers
316
-
317
- self.encoder = attentions_onnx.Encoder(
318
- hidden_channels,
319
- filter_channels,
320
- n_heads,
321
- n_layers,
322
- kernel_size,
323
- p_dropout,
324
- gin_channels=self.gin_channels,
325
- )
326
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
327
-
328
- def init_vq(self):
329
- self.emb_vq = nn.Embedding(10 * self.n_speakers, 1024)
330
- self.emb_vq_weight = torch.zeros(10 * self.n_speakers, 1024).float()
331
- for i in range(self.n_speakers):
332
- for j in range(10):
333
- self.emb_vq_weight[i * 10 + j] = self.emo_quantizer[
334
- i
335
- ].get_output_from_indices(torch.LongTensor([j]))
336
- self.emb_vq.weight = nn.Parameter(self.emb_vq_weight.clone())
337
-
338
- def forward(
339
- self,
340
- x,
341
- x_lengths,
342
- tone,
343
- language,
344
- bert,
345
- ja_bert,
346
- en_bert,
347
- g=None,
348
- vqidx=None,
349
- sid=None,
350
- ):
351
- x_mask = torch.ones_like(x).unsqueeze(0)
352
- bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2)
353
- ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(
354
- 1, 2
355
- )
356
- en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(
357
- 1, 2
358
- )
359
-
360
- emb_vq_idx = torch.clamp(
361
- (sid * 10) + vqidx, min=0, max=(self.n_speakers * 10) - 1
362
- )
363
-
364
- vqval = self.emb_vq(emb_vq_idx)
365
-
366
- x = (
367
- self.emb(x)
368
- + self.tone_emb(tone)
369
- + self.language_emb(language)
370
- + bert_emb
371
- + ja_bert_emb
372
- + en_bert_emb
373
- + self.emo_q_proj(vqval)
374
- ) * math.sqrt(
375
- self.hidden_channels
376
- ) # [b, t, h]
377
- x = torch.transpose(x, 1, -1) # [b, h, t]
378
- x_mask = x_mask.to(x.dtype)
379
-
380
- x = self.encoder(x * x_mask, x_mask, g=g)
381
- stats = self.proj(x) * x_mask
382
-
383
- m, logs = torch.split(stats, self.out_channels, dim=1)
384
- return x, m, logs, x_mask
385
-
386
-
387
- class ResidualCouplingBlock(nn.Module):
388
- def __init__(
389
- self,
390
- channels,
391
- hidden_channels,
392
- kernel_size,
393
- dilation_rate,
394
- n_layers,
395
- n_flows=4,
396
- gin_channels=0,
397
- ):
398
- super().__init__()
399
- self.channels = channels
400
- self.hidden_channels = hidden_channels
401
- self.kernel_size = kernel_size
402
- self.dilation_rate = dilation_rate
403
- self.n_layers = n_layers
404
- self.n_flows = n_flows
405
- self.gin_channels = gin_channels
406
-
407
- self.flows = nn.ModuleList()
408
- for i in range(n_flows):
409
- self.flows.append(
410
- modules.ResidualCouplingLayer(
411
- channels,
412
- hidden_channels,
413
- kernel_size,
414
- dilation_rate,
415
- n_layers,
416
- gin_channels=gin_channels,
417
- mean_only=True,
418
- )
419
- )
420
- self.flows.append(modules.Flip())
421
-
422
- def forward(self, x, x_mask, g=None, reverse=True):
423
- if not reverse:
424
- for flow in self.flows:
425
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
426
- else:
427
- for flow in reversed(self.flows):
428
- x = flow(x, x_mask, g=g, reverse=reverse)
429
- return x
430
-
431
-
432
- class PosteriorEncoder(nn.Module):
433
- def __init__(
434
- self,
435
- in_channels,
436
- out_channels,
437
- hidden_channels,
438
- kernel_size,
439
- dilation_rate,
440
- n_layers,
441
- gin_channels=0,
442
- ):
443
- super().__init__()
444
- self.in_channels = in_channels
445
- self.out_channels = out_channels
446
- self.hidden_channels = hidden_channels
447
- self.kernel_size = kernel_size
448
- self.dilation_rate = dilation_rate
449
- self.n_layers = n_layers
450
- self.gin_channels = gin_channels
451
-
452
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
453
- self.enc = modules.WN(
454
- hidden_channels,
455
- kernel_size,
456
- dilation_rate,
457
- n_layers,
458
- gin_channels=gin_channels,
459
- )
460
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
461
-
462
- def forward(self, x, x_lengths, g=None):
463
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
464
- x.dtype
465
- )
466
- x = self.pre(x) * x_mask
467
- x = self.enc(x, x_mask, g=g)
468
- stats = self.proj(x) * x_mask
469
- m, logs = torch.split(stats, self.out_channels, dim=1)
470
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
471
- return z, m, logs, x_mask
472
-
473
-
474
- class Generator(torch.nn.Module):
475
- def __init__(
476
- self,
477
- initial_channel,
478
- resblock,
479
- resblock_kernel_sizes,
480
- resblock_dilation_sizes,
481
- upsample_rates,
482
- upsample_initial_channel,
483
- upsample_kernel_sizes,
484
- gin_channels=0,
485
- ):
486
- super(Generator, self).__init__()
487
- self.num_kernels = len(resblock_kernel_sizes)
488
- self.num_upsamples = len(upsample_rates)
489
- self.conv_pre = Conv1d(
490
- initial_channel, upsample_initial_channel, 7, 1, padding=3
491
- )
492
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
493
-
494
- self.ups = nn.ModuleList()
495
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
496
- self.ups.append(
497
- weight_norm(
498
- ConvTranspose1d(
499
- upsample_initial_channel // (2**i),
500
- upsample_initial_channel // (2 ** (i + 1)),
501
- k,
502
- u,
503
- padding=(k - u) // 2,
504
- )
505
- )
506
- )
507
-
508
- self.resblocks = nn.ModuleList()
509
- for i in range(len(self.ups)):
510
- ch = upsample_initial_channel // (2 ** (i + 1))
511
- for j, (k, d) in enumerate(
512
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
513
- ):
514
- self.resblocks.append(resblock(ch, k, d))
515
-
516
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
517
- self.ups.apply(init_weights)
518
-
519
- if gin_channels != 0:
520
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
521
-
522
- def forward(self, x, g=None):
523
- x = self.conv_pre(x)
524
- if g is not None:
525
- x = x + self.cond(g)
526
-
527
- for i in range(self.num_upsamples):
528
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
529
- x = self.ups[i](x)
530
- xs = None
531
- for j in range(self.num_kernels):
532
- if xs is None:
533
- xs = self.resblocks[i * self.num_kernels + j](x)
534
- else:
535
- xs += self.resblocks[i * self.num_kernels + j](x)
536
- x = xs / self.num_kernels
537
- x = F.leaky_relu(x)
538
- x = self.conv_post(x)
539
- x = torch.tanh(x)
540
-
541
- return x
542
-
543
- def remove_weight_norm(self):
544
- print("Removing weight norm...")
545
- for layer in self.ups:
546
- remove_weight_norm(layer)
547
- for layer in self.resblocks:
548
- layer.remove_weight_norm()
549
-
550
-
551
- class DiscriminatorP(torch.nn.Module):
552
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
553
- super(DiscriminatorP, self).__init__()
554
- self.period = period
555
- self.use_spectral_norm = use_spectral_norm
556
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
557
- self.convs = nn.ModuleList(
558
- [
559
- norm_f(
560
- Conv2d(
561
- 1,
562
- 32,
563
- (kernel_size, 1),
564
- (stride, 1),
565
- padding=(get_padding(kernel_size, 1), 0),
566
- )
567
- ),
568
- norm_f(
569
- Conv2d(
570
- 32,
571
- 128,
572
- (kernel_size, 1),
573
- (stride, 1),
574
- padding=(get_padding(kernel_size, 1), 0),
575
- )
576
- ),
577
- norm_f(
578
- Conv2d(
579
- 128,
580
- 512,
581
- (kernel_size, 1),
582
- (stride, 1),
583
- padding=(get_padding(kernel_size, 1), 0),
584
- )
585
- ),
586
- norm_f(
587
- Conv2d(
588
- 512,
589
- 1024,
590
- (kernel_size, 1),
591
- (stride, 1),
592
- padding=(get_padding(kernel_size, 1), 0),
593
- )
594
- ),
595
- norm_f(
596
- Conv2d(
597
- 1024,
598
- 1024,
599
- (kernel_size, 1),
600
- 1,
601
- padding=(get_padding(kernel_size, 1), 0),
602
- )
603
- ),
604
- ]
605
- )
606
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
607
-
608
- def forward(self, x):
609
- fmap = []
610
-
611
- # 1d to 2d
612
- b, c, t = x.shape
613
- if t % self.period != 0: # pad first
614
- n_pad = self.period - (t % self.period)
615
- x = F.pad(x, (0, n_pad), "reflect")
616
- t = t + n_pad
617
- x = x.view(b, c, t // self.period, self.period)
618
-
619
- for layer in self.convs:
620
- x = layer(x)
621
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
622
- fmap.append(x)
623
- x = self.conv_post(x)
624
- fmap.append(x)
625
- x = torch.flatten(x, 1, -1)
626
-
627
- return x, fmap
628
-
629
-
630
- class DiscriminatorS(torch.nn.Module):
631
- def __init__(self, use_spectral_norm=False):
632
- super(DiscriminatorS, self).__init__()
633
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
634
- self.convs = nn.ModuleList(
635
- [
636
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
637
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
638
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
639
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
640
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
641
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
642
- ]
643
- )
644
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
645
-
646
- def forward(self, x):
647
- fmap = []
648
-
649
- for layer in self.convs:
650
- x = layer(x)
651
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
652
- fmap.append(x)
653
- x = self.conv_post(x)
654
- fmap.append(x)
655
- x = torch.flatten(x, 1, -1)
656
-
657
- return x, fmap
658
-
659
-
660
- class MultiPeriodDiscriminator(torch.nn.Module):
661
- def __init__(self, use_spectral_norm=False):
662
- super(MultiPeriodDiscriminator, self).__init__()
663
- periods = [2, 3, 5, 7, 11]
664
-
665
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
666
- discs = discs + [
667
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
668
- ]
669
- self.discriminators = nn.ModuleList(discs)
670
-
671
- def forward(self, y, y_hat):
672
- y_d_rs = []
673
- y_d_gs = []
674
- fmap_rs = []
675
- fmap_gs = []
676
- for i, d in enumerate(self.discriminators):
677
- y_d_r, fmap_r = d(y)
678
- y_d_g, fmap_g = d(y_hat)
679
- y_d_rs.append(y_d_r)
680
- y_d_gs.append(y_d_g)
681
- fmap_rs.append(fmap_r)
682
- fmap_gs.append(fmap_g)
683
-
684
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
685
-
686
-
687
- class ReferenceEncoder(nn.Module):
688
- """
689
- inputs --- [N, Ty/r, n_mels*r] mels
690
- outputs --- [N, ref_enc_gru_size]
691
- """
692
-
693
- def __init__(self, spec_channels, gin_channels=0):
694
- super().__init__()
695
- self.spec_channels = spec_channels
696
- ref_enc_filters = [32, 32, 64, 64, 128, 128]
697
- K = len(ref_enc_filters)
698
- filters = [1] + ref_enc_filters
699
- convs = [
700
- weight_norm(
701
- nn.Conv2d(
702
- in_channels=filters[i],
703
- out_channels=filters[i + 1],
704
- kernel_size=(3, 3),
705
- stride=(2, 2),
706
- padding=(1, 1),
707
- )
708
- )
709
- for i in range(K)
710
- ]
711
- self.convs = nn.ModuleList(convs)
712
- # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
713
-
714
- out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
715
- self.gru = nn.GRU(
716
- input_size=ref_enc_filters[-1] * out_channels,
717
- hidden_size=256 // 2,
718
- batch_first=True,
719
- )
720
- self.proj = nn.Linear(128, gin_channels)
721
-
722
- def forward(self, inputs, mask=None):
723
- N = inputs.size(0)
724
- out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
725
- for conv in self.convs:
726
- out = conv(out)
727
- # out = wn(out)
728
- out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
729
-
730
- out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
731
- T = out.size(1)
732
- N = out.size(0)
733
- out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
734
-
735
- self.gru.flatten_parameters()
736
- memory, out = self.gru(out) # out --- [1, N, 128]
737
-
738
- return self.proj(out.squeeze(0))
739
-
740
- def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
741
- for i in range(n_convs):
742
- L = (L - kernel_size + 2 * pad) // stride + 1
743
- return L
744
-
745
-
746
- class SynthesizerTrn(nn.Module):
747
- """
748
- Synthesizer for Training
749
- """
750
-
751
- def __init__(
752
- self,
753
- n_vocab,
754
- spec_channels,
755
- segment_size,
756
- inter_channels,
757
- hidden_channels,
758
- filter_channels,
759
- n_heads,
760
- n_layers,
761
- kernel_size,
762
- p_dropout,
763
- resblock,
764
- resblock_kernel_sizes,
765
- resblock_dilation_sizes,
766
- upsample_rates,
767
- upsample_initial_channel,
768
- upsample_kernel_sizes,
769
- n_speakers=256,
770
- gin_channels=256,
771
- use_sdp=True,
772
- n_flow_layer=4,
773
- n_layers_trans_flow=4,
774
- flow_share_parameter=False,
775
- use_transformer_flow=True,
776
- **kwargs,
777
- ):
778
- super().__init__()
779
- self.n_vocab = n_vocab
780
- self.spec_channels = spec_channels
781
- self.inter_channels = inter_channels
782
- self.hidden_channels = hidden_channels
783
- self.filter_channels = filter_channels
784
- self.n_heads = n_heads
785
- self.n_layers = n_layers
786
- self.kernel_size = kernel_size
787
- self.p_dropout = p_dropout
788
- self.resblock = resblock
789
- self.resblock_kernel_sizes = resblock_kernel_sizes
790
- self.resblock_dilation_sizes = resblock_dilation_sizes
791
- self.upsample_rates = upsample_rates
792
- self.upsample_initial_channel = upsample_initial_channel
793
- self.upsample_kernel_sizes = upsample_kernel_sizes
794
- self.segment_size = segment_size
795
- self.n_speakers = n_speakers
796
- self.gin_channels = gin_channels
797
- self.n_layers_trans_flow = n_layers_trans_flow
798
- self.use_spk_conditioned_encoder = kwargs.get(
799
- "use_spk_conditioned_encoder", True
800
- )
801
- self.use_sdp = use_sdp
802
- self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
803
- self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
804
- self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
805
- self.current_mas_noise_scale = self.mas_noise_scale_initial
806
- if self.use_spk_conditioned_encoder and gin_channels > 0:
807
- self.enc_gin_channels = gin_channels
808
- self.enc_p = TextEncoder(
809
- n_vocab,
810
- inter_channels,
811
- hidden_channels,
812
- filter_channels,
813
- n_heads,
814
- n_layers,
815
- kernel_size,
816
- p_dropout,
817
- n_speakers,
818
- gin_channels=self.enc_gin_channels,
819
- )
820
- self.dec = Generator(
821
- inter_channels,
822
- resblock,
823
- resblock_kernel_sizes,
824
- resblock_dilation_sizes,
825
- upsample_rates,
826
- upsample_initial_channel,
827
- upsample_kernel_sizes,
828
- gin_channels=gin_channels,
829
- )
830
- self.enc_q = PosteriorEncoder(
831
- spec_channels,
832
- inter_channels,
833
- hidden_channels,
834
- 5,
835
- 1,
836
- 16,
837
- gin_channels=gin_channels,
838
- )
839
- if use_transformer_flow:
840
- self.flow = TransformerCouplingBlock(
841
- inter_channels,
842
- hidden_channels,
843
- filter_channels,
844
- n_heads,
845
- n_layers_trans_flow,
846
- 5,
847
- p_dropout,
848
- n_flow_layer,
849
- gin_channels=gin_channels,
850
- share_parameter=flow_share_parameter,
851
- )
852
- else:
853
- self.flow = ResidualCouplingBlock(
854
- inter_channels,
855
- hidden_channels,
856
- 5,
857
- 1,
858
- n_flow_layer,
859
- gin_channels=gin_channels,
860
- )
861
- self.sdp = StochasticDurationPredictor(
862
- hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
863
- )
864
- self.dp = DurationPredictor(
865
- hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
866
- )
867
-
868
- if n_speakers >= 1:
869
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
870
- else:
871
- self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
872
-
873
- def export_onnx(
874
- self,
875
- path,
876
- max_len=None,
877
- sdp_ratio=0,
878
- y=None,
879
- ):
880
- noise_scale = 0.667
881
- length_scale = 1
882
- noise_scale_w = 0.8
883
- x = (
884
- torch.LongTensor(
885
- [
886
- 0,
887
- 97,
888
- 0,
889
- 8,
890
- 0,
891
- 78,
892
- 0,
893
- 8,
894
- 0,
895
- 76,
896
- 0,
897
- 37,
898
- 0,
899
- 40,
900
- 0,
901
- 97,
902
- 0,
903
- 8,
904
- 0,
905
- 23,
906
- 0,
907
- 8,
908
- 0,
909
- 74,
910
- 0,
911
- 26,
912
- 0,
913
- 104,
914
- 0,
915
- ]
916
- )
917
- .unsqueeze(0)
918
- .cpu()
919
- )
920
- tone = torch.zeros_like(x).cpu()
921
- language = torch.zeros_like(x).cpu()
922
- x_lengths = torch.LongTensor([x.shape[1]]).cpu()
923
- sid = torch.LongTensor([0]).cpu()
924
- bert = torch.randn(size=(x.shape[1], 1024)).cpu()
925
- ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
926
- en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
927
-
928
- if self.n_speakers > 0:
929
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
930
- torch.onnx.export(
931
- self.emb_g,
932
- (sid),
933
- f"onnx/{path}/{path}_emb.onnx",
934
- input_names=["sid"],
935
- output_names=["g"],
936
- verbose=True,
937
- )
938
- else:
939
- g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
940
-
941
- self.enc_p.init_vq()
942
-
943
- torch.onnx.export(
944
- self.enc_p,
945
- (x, x_lengths, tone, language, bert, ja_bert, en_bert, g, sid, sid),
946
- f"onnx/{path}/{path}_enc_p.onnx",
947
- input_names=[
948
- "x",
949
- "x_lengths",
950
- "t",
951
- "language",
952
- "bert_0",
953
- "bert_1",
954
- "bert_2",
955
- "g",
956
- "vqidx",
957
- "sid",
958
- ],
959
- output_names=["xout", "m_p", "logs_p", "x_mask"],
960
- dynamic_axes={
961
- "x": [0, 1],
962
- "t": [0, 1],
963
- "language": [0, 1],
964
- "bert_0": [0],
965
- "bert_1": [0],
966
- "bert_2": [0],
967
- "xout": [0, 2],
968
- "m_p": [0, 2],
969
- "logs_p": [0, 2],
970
- "x_mask": [0, 2],
971
- },
972
- verbose=True,
973
- opset_version=16,
974
- )
975
-
976
- x, m_p, logs_p, x_mask = self.enc_p(
977
- x, x_lengths, tone, language, bert, ja_bert, en_bert, g, sid, sid
978
- )
979
-
980
- zinput = (
981
- torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
982
- * noise_scale_w
983
- )
984
- torch.onnx.export(
985
- self.sdp,
986
- (x, x_mask, zinput, g),
987
- f"onnx/{path}/{path}_sdp.onnx",
988
- input_names=["x", "x_mask", "zin", "g"],
989
- output_names=["logw"],
990
- dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
991
- verbose=True,
992
- )
993
- torch.onnx.export(
994
- self.dp,
995
- (x, x_mask, g),
996
- f"onnx/{path}/{path}_dp.onnx",
997
- input_names=["x", "x_mask", "g"],
998
- output_names=["logw"],
999
- dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
1000
- verbose=True,
1001
- )
1002
- logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
1003
- x, x_mask, g=g
1004
- ) * (1 - sdp_ratio)
1005
- w = torch.exp(logw) * x_mask * length_scale
1006
- w_ceil = torch.ceil(w)
1007
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
1008
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
1009
- x_mask.dtype
1010
- )
1011
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
1012
- attn = commons.generate_path(w_ceil, attn_mask)
1013
-
1014
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
1015
- 1, 2
1016
- ) # [b, t', t], [b, t, d] -> [b, d, t']
1017
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
1018
- 1, 2
1019
- ) # [b, t', t], [b, t, d] -> [b, d, t']
1020
-
1021
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
1022
- torch.onnx.export(
1023
- self.flow,
1024
- (z_p, y_mask, g),
1025
- f"onnx/{path}/{path}_flow.onnx",
1026
- input_names=["z_p", "y_mask", "g"],
1027
- output_names=["z"],
1028
- dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
1029
- verbose=True,
1030
- )
1031
-
1032
- z = self.flow(z_p, y_mask, g=g, reverse=True)
1033
- z_in = (z * y_mask)[:, :, :max_len]
1034
-
1035
- torch.onnx.export(
1036
- self.dec,
1037
- (z_in, g),
1038
- f"onnx/{path}/{path}_dec.onnx",
1039
- input_names=["z_in", "g"],
1040
- output_names=["o"],
1041
- dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
1042
- verbose=True,
1043
- )
1044
- o = self.dec((z * y_mask)[:, :, :max_len], g=g)
 
 
onnx_modules/V210/text/__init__.py DELETED
@@ -1 +0,0 @@
- from .symbols import *
 
 
onnx_modules/V210/text/symbols.py DELETED
@@ -1,187 +0,0 @@
1
- punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
- pu_symbols = punctuation + ["SP", "UNK"]
3
- pad = "_"
4
-
5
- # chinese
6
- zh_symbols = [
7
- "E",
8
- "En",
9
- "a",
10
- "ai",
11
- "an",
12
- "ang",
13
- "ao",
14
- "b",
15
- "c",
16
- "ch",
17
- "d",
18
- "e",
19
- "ei",
20
- "en",
21
- "eng",
22
- "er",
23
- "f",
24
- "g",
25
- "h",
26
- "i",
27
- "i0",
28
- "ia",
29
- "ian",
30
- "iang",
31
- "iao",
32
- "ie",
33
- "in",
34
- "ing",
35
- "iong",
36
- "ir",
37
- "iu",
38
- "j",
39
- "k",
40
- "l",
41
- "m",
42
- "n",
43
- "o",
44
- "ong",
45
- "ou",
46
- "p",
47
- "q",
48
- "r",
49
- "s",
50
- "sh",
51
- "t",
52
- "u",
53
- "ua",
54
- "uai",
55
- "uan",
56
- "uang",
57
- "ui",
58
- "un",
59
- "uo",
60
- "v",
61
- "van",
62
- "ve",
63
- "vn",
64
- "w",
65
- "x",
66
- "y",
67
- "z",
68
- "zh",
69
- "AA",
70
- "EE",
71
- "OO",
72
- ]
73
- num_zh_tones = 6
74
-
75
- # japanese
76
- ja_symbols = [
77
- "N",
78
- "a",
79
- "a:",
80
- "b",
81
- "by",
82
- "ch",
83
- "d",
84
- "dy",
85
- "e",
86
- "e:",
87
- "f",
88
- "g",
89
- "gy",
90
- "h",
91
- "hy",
92
- "i",
93
- "i:",
94
- "j",
95
- "k",
96
- "ky",
97
- "m",
98
- "my",
99
- "n",
100
- "ny",
101
- "o",
102
- "o:",
103
- "p",
104
- "py",
105
- "q",
106
- "r",
107
- "ry",
108
- "s",
109
- "sh",
110
- "t",
111
- "ts",
112
- "ty",
113
- "u",
114
- "u:",
115
- "w",
116
- "y",
117
- "z",
118
- "zy",
119
- ]
120
- num_ja_tones = 2
121
-
122
- # English
123
- en_symbols = [
124
- "aa",
125
- "ae",
126
- "ah",
127
- "ao",
128
- "aw",
129
- "ay",
130
- "b",
131
- "ch",
132
- "d",
133
- "dh",
134
- "eh",
135
- "er",
136
- "ey",
137
- "f",
138
- "g",
139
- "hh",
140
- "ih",
141
- "iy",
142
- "jh",
143
- "k",
144
- "l",
145
- "m",
146
- "n",
147
- "ng",
148
- "ow",
149
- "oy",
150
- "p",
151
- "r",
152
- "s",
153
- "sh",
154
- "t",
155
- "th",
156
- "uh",
157
- "uw",
158
- "V",
159
- "w",
160
- "y",
161
- "z",
162
- "zh",
163
- ]
164
- num_en_tones = 4
165
-
166
- # combine all symbols
167
- normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168
- symbols = [pad] + normal_symbols + pu_symbols
169
- sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170
-
171
- # combine all tones
172
- num_tones = num_zh_tones + num_ja_tones + num_en_tones
173
-
174
- # language maps
175
- language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176
- num_languages = len(language_id_map.keys())
177
-
178
- language_tone_start_map = {
179
- "ZH": 0,
180
- "JP": num_zh_tones,
181
- "EN": num_zh_tones + num_ja_tones,
182
- }
183
-
184
- if __name__ == "__main__":
185
- a = set(zh_symbols)
186
- b = set(en_symbols)
187
- print(sorted(a & b))
 
 
onnx_modules/V220/__init__.py DELETED
File without changes
onnx_modules/V220/attentions_onnx.py DELETED
@@ -1,378 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import logging
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- class LayerNorm(nn.Module):
13
- def __init__(self, channels, eps=1e-5):
14
- super().__init__()
15
- self.channels = channels
16
- self.eps = eps
17
-
18
- self.gamma = nn.Parameter(torch.ones(channels))
19
- self.beta = nn.Parameter(torch.zeros(channels))
20
-
21
- def forward(self, x):
22
- x = x.transpose(1, -1)
23
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
- return x.transpose(1, -1)
25
-
26
-
27
- @torch.jit.script
28
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
- n_channels_int = n_channels[0]
30
- in_act = input_a + input_b
31
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
- acts = t_act * s_act
34
- return acts
35
-
36
-
37
- class Encoder(nn.Module):
38
- def __init__(
39
- self,
40
- hidden_channels,
41
- filter_channels,
42
- n_heads,
43
- n_layers,
44
- kernel_size=1,
45
- p_dropout=0.0,
46
- window_size=4,
47
- isflow=True,
48
- **kwargs
49
- ):
50
- super().__init__()
51
- self.hidden_channels = hidden_channels
52
- self.filter_channels = filter_channels
53
- self.n_heads = n_heads
54
- self.n_layers = n_layers
55
- self.kernel_size = kernel_size
56
- self.p_dropout = p_dropout
57
- self.window_size = window_size
58
- # if isflow:
59
- # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
- # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
- # self.cond_layer = weight_norm(cond_layer, name='weight')
62
- # self.gin_channels = 256
63
- self.cond_layer_idx = self.n_layers
64
- if "gin_channels" in kwargs:
65
- self.gin_channels = kwargs["gin_channels"]
66
- if self.gin_channels != 0:
67
- self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
- # vits2 says 3rd block, so idx is 2 by default
69
- self.cond_layer_idx = (
70
- kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
- )
72
- logging.debug(self.gin_channels, self.cond_layer_idx)
73
- assert (
74
- self.cond_layer_idx < self.n_layers
75
- ), "cond_layer_idx should be less than n_layers"
76
- self.drop = nn.Dropout(p_dropout)
77
- self.attn_layers = nn.ModuleList()
78
- self.norm_layers_1 = nn.ModuleList()
79
- self.ffn_layers = nn.ModuleList()
80
- self.norm_layers_2 = nn.ModuleList()
81
- for i in range(self.n_layers):
82
- self.attn_layers.append(
83
- MultiHeadAttention(
84
- hidden_channels,
85
- hidden_channels,
86
- n_heads,
87
- p_dropout=p_dropout,
88
- window_size=window_size,
89
- )
90
- )
91
- self.norm_layers_1.append(LayerNorm(hidden_channels))
92
- self.ffn_layers.append(
93
- FFN(
94
- hidden_channels,
95
- hidden_channels,
96
- filter_channels,
97
- kernel_size,
98
- p_dropout=p_dropout,
99
- )
100
- )
101
- self.norm_layers_2.append(LayerNorm(hidden_channels))
102
-
103
- def forward(self, x, x_mask, g=None):
104
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
105
- x = x * x_mask
106
- for i in range(self.n_layers):
107
- if i == self.cond_layer_idx and g is not None:
108
- g = self.spk_emb_linear(g.transpose(1, 2))
109
- g = g.transpose(1, 2)
110
- x = x + g
111
- x = x * x_mask
112
- y = self.attn_layers[i](x, x, attn_mask)
113
- y = self.drop(y)
114
- x = self.norm_layers_1[i](x + y)
115
-
116
- y = self.ffn_layers[i](x, x_mask)
117
- y = self.drop(y)
118
- x = self.norm_layers_2[i](x + y)
119
- x = x * x_mask
120
- return x
121
-
122
-
123
- class MultiHeadAttention(nn.Module):
124
- def __init__(
125
- self,
126
- channels,
127
- out_channels,
128
- n_heads,
129
- p_dropout=0.0,
130
- window_size=None,
131
- heads_share=True,
132
- block_length=None,
133
- proximal_bias=False,
134
- proximal_init=False,
135
- ):
136
- super().__init__()
137
- assert channels % n_heads == 0
138
-
139
- self.channels = channels
140
- self.out_channels = out_channels
141
- self.n_heads = n_heads
142
- self.p_dropout = p_dropout
143
- self.window_size = window_size
144
- self.heads_share = heads_share
145
- self.block_length = block_length
146
- self.proximal_bias = proximal_bias
147
- self.proximal_init = proximal_init
148
- self.attn = None
149
-
150
- self.k_channels = channels // n_heads
151
- self.conv_q = nn.Conv1d(channels, channels, 1)
152
- self.conv_k = nn.Conv1d(channels, channels, 1)
153
- self.conv_v = nn.Conv1d(channels, channels, 1)
154
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
155
- self.drop = nn.Dropout(p_dropout)
156
-
157
- if window_size is not None:
158
- n_heads_rel = 1 if heads_share else n_heads
159
- rel_stddev = self.k_channels**-0.5
160
- self.emb_rel_k = nn.Parameter(
161
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
162
- * rel_stddev
163
- )
164
- self.emb_rel_v = nn.Parameter(
165
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
166
- * rel_stddev
167
- )
168
-
169
- nn.init.xavier_uniform_(self.conv_q.weight)
170
- nn.init.xavier_uniform_(self.conv_k.weight)
171
- nn.init.xavier_uniform_(self.conv_v.weight)
172
- if proximal_init:
173
- with torch.no_grad():
174
- self.conv_k.weight.copy_(self.conv_q.weight)
175
- self.conv_k.bias.copy_(self.conv_q.bias)
176
-
177
- def forward(self, x, c, attn_mask=None):
178
- q = self.conv_q(x)
179
- k = self.conv_k(c)
180
- v = self.conv_v(c)
181
-
182
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
183
-
184
- x = self.conv_o(x)
185
- return x
186
-
187
- def attention(self, query, key, value, mask=None):
188
- # reshape [b, d, t] -> [b, n_h, t, d_k]
189
- b, d, t_s, t_t = (*key.size(), query.size(2))
190
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
191
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
192
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
193
-
194
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
195
- if self.window_size is not None:
196
- assert (
197
- t_s == t_t
198
- ), "Relative attention is only available for self-attention."
199
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
200
- rel_logits = self._matmul_with_relative_keys(
201
- query / math.sqrt(self.k_channels), key_relative_embeddings
202
- )
203
- scores_local = self._relative_position_to_absolute_position(rel_logits)
204
- scores = scores + scores_local
205
- if self.proximal_bias:
206
- assert t_s == t_t, "Proximal bias is only available for self-attention."
207
- scores = scores + self._attention_bias_proximal(t_s).to(
208
- device=scores.device, dtype=scores.dtype
209
- )
210
- if mask is not None:
211
- scores = scores.masked_fill(mask == 0, -1e4)
212
- if self.block_length is not None:
213
- assert (
214
- t_s == t_t
215
- ), "Local attention is only available for self-attention."
216
- block_mask = (
217
- torch.ones_like(scores)
218
- .triu(-self.block_length)
219
- .tril(self.block_length)
220
- )
221
- scores = scores.masked_fill(block_mask == 0, -1e4)
222
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
223
- p_attn = self.drop(p_attn)
224
- output = torch.matmul(p_attn, value)
225
- if self.window_size is not None:
226
- relative_weights = self._absolute_position_to_relative_position(p_attn)
227
- value_relative_embeddings = self._get_relative_embeddings(
228
- self.emb_rel_v, t_s
229
- )
230
- output = output + self._matmul_with_relative_values(
231
- relative_weights, value_relative_embeddings
232
- )
233
- output = (
234
- output.transpose(2, 3).contiguous().view(b, d, t_t)
235
- ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
236
- return output, p_attn
237
-
238
- def _matmul_with_relative_values(self, x, y):
239
- """
240
- x: [b, h, l, m]
241
- y: [h or 1, m, d]
242
- ret: [b, h, l, d]
243
- """
244
- ret = torch.matmul(x, y.unsqueeze(0))
245
- return ret
246
-
247
- def _matmul_with_relative_keys(self, x, y):
248
- """
249
- x: [b, h, l, d]
250
- y: [h or 1, m, d]
251
- ret: [b, h, l, m]
252
- """
253
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
254
- return ret
255
-
256
- def _get_relative_embeddings(self, relative_embeddings, length):
257
- max_relative_position = 2 * self.window_size + 1
258
- # Pad first before slice to avoid using cond ops.
259
- pad_length = max(length - (self.window_size + 1), 0)
260
- slice_start_position = max((self.window_size + 1) - length, 0)
261
- slice_end_position = slice_start_position + 2 * length - 1
262
- if pad_length > 0:
263
- padded_relative_embeddings = F.pad(
264
- relative_embeddings,
265
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
266
- )
267
- else:
268
- padded_relative_embeddings = relative_embeddings
269
- used_relative_embeddings = padded_relative_embeddings[
270
- :, slice_start_position:slice_end_position
271
- ]
272
- return used_relative_embeddings
273
-
274
- def _relative_position_to_absolute_position(self, x):
275
- """
276
- x: [b, h, l, 2*l-1]
277
- ret: [b, h, l, l]
278
- """
279
- batch, heads, length, _ = x.size()
280
- # Concat columns of pad to shift from relative to absolute indexing.
281
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
282
-
283
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
284
- x_flat = x.view([batch, heads, length * 2 * length])
285
- x_flat = F.pad(
286
- x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
287
- )
288
-
289
- # Reshape and slice out the padded elements.
290
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
291
- :, :, :length, length - 1 :
292
- ]
293
- return x_final
294
-
295
- def _absolute_position_to_relative_position(self, x):
296
- """
297
- x: [b, h, l, l]
298
- ret: [b, h, l, 2*l-1]
299
- """
300
- batch, heads, length, _ = x.size()
301
- # padd along column
302
- x = F.pad(
303
- x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
304
- )
305
- x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
306
- # add 0's in the beginning that will skew the elements after reshape
307
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
308
- x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
309
- return x_final
310
-
311
- def _attention_bias_proximal(self, length):
312
- """Bias for self-attention to encourage attention to close positions.
313
- Args:
314
- length: an integer scalar.
315
- Returns:
316
- a Tensor with shape [1, 1, length, length]
317
- """
318
- r = torch.arange(length, dtype=torch.float32)
319
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
320
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
321
-
322
-
323
- class FFN(nn.Module):
324
- def __init__(
325
- self,
326
- in_channels,
327
- out_channels,
328
- filter_channels,
329
- kernel_size,
330
- p_dropout=0.0,
331
- activation=None,
332
- causal=False,
333
- ):
334
- super().__init__()
335
- self.in_channels = in_channels
336
- self.out_channels = out_channels
337
- self.filter_channels = filter_channels
338
- self.kernel_size = kernel_size
339
- self.p_dropout = p_dropout
340
- self.activation = activation
341
- self.causal = causal
342
-
343
- if causal:
344
- self.padding = self._causal_padding
345
- else:
346
- self.padding = self._same_padding
347
-
348
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
349
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
350
- self.drop = nn.Dropout(p_dropout)
351
-
352
- def forward(self, x, x_mask):
353
- x = self.conv_1(self.padding(x * x_mask))
354
- if self.activation == "gelu":
355
- x = x * torch.sigmoid(1.702 * x)
356
- else:
357
- x = torch.relu(x)
358
- x = self.drop(x)
359
- x = self.conv_2(self.padding(x * x_mask))
360
- return x * x_mask
361
-
362
- def _causal_padding(self, x):
363
- if self.kernel_size == 1:
364
- return x
365
- pad_l = self.kernel_size - 1
366
- pad_r = 0
367
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
368
- x = F.pad(x, commons.convert_pad_shape(padding))
369
- return x
370
-
371
- def _same_padding(self, x):
372
- if self.kernel_size == 1:
373
- return x
374
- pad_l = (self.kernel_size - 1) // 2
375
- pad_r = self.kernel_size // 2
376
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
377
- x = F.pad(x, commons.convert_pad_shape(padding))
378
- return x
 
 
onnx_modules/V220/models_onnx.py DELETED
@@ -1,1076 +0,0 @@
1
- import math
2
- import torch
3
- from torch import nn
4
- from torch.nn import functional as F
5
-
6
- import commons
7
- import modules
8
- from . import attentions_onnx
9
- from vector_quantize_pytorch import VectorQuantize
10
-
11
- from torch.nn import Conv1d, ConvTranspose1d, Conv2d
12
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
- from commons import init_weights, get_padding
14
- from .text import symbols, num_tones, num_languages
15
-
16
-
17
- class DurationDiscriminator(nn.Module): # vits2
18
- def __init__(
19
- self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
20
- ):
21
- super().__init__()
22
-
23
- self.in_channels = in_channels
24
- self.filter_channels = filter_channels
25
- self.kernel_size = kernel_size
26
- self.p_dropout = p_dropout
27
- self.gin_channels = gin_channels
28
-
29
- self.drop = nn.Dropout(p_dropout)
30
- self.conv_1 = nn.Conv1d(
31
- in_channels, filter_channels, kernel_size, padding=kernel_size // 2
32
- )
33
- self.norm_1 = modules.LayerNorm(filter_channels)
34
- self.conv_2 = nn.Conv1d(
35
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
36
- )
37
- self.norm_2 = modules.LayerNorm(filter_channels)
38
- self.dur_proj = nn.Conv1d(1, filter_channels, 1)
39
-
40
- self.pre_out_conv_1 = nn.Conv1d(
41
- 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
42
- )
43
- self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
44
- self.pre_out_conv_2 = nn.Conv1d(
45
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
46
- )
47
- self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
48
-
49
- if gin_channels != 0:
50
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
51
-
52
- self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
53
-
54
- def forward_probability(self, x, x_mask, dur, g=None):
55
- dur = self.dur_proj(dur)
56
- x = torch.cat([x, dur], dim=1)
57
- x = self.pre_out_conv_1(x * x_mask)
58
- x = torch.relu(x)
59
- x = self.pre_out_norm_1(x)
60
- x = self.drop(x)
61
- x = self.pre_out_conv_2(x * x_mask)
62
- x = torch.relu(x)
63
- x = self.pre_out_norm_2(x)
64
- x = self.drop(x)
65
- x = x * x_mask
66
- x = x.transpose(1, 2)
67
- output_prob = self.output_layer(x)
68
- return output_prob
69
-
70
- def forward(self, x, x_mask, dur_r, dur_hat, g=None):
71
- x = torch.detach(x)
72
- if g is not None:
73
- g = torch.detach(g)
74
- x = x + self.cond(g)
75
- x = self.conv_1(x * x_mask)
76
- x = torch.relu(x)
77
- x = self.norm_1(x)
78
- x = self.drop(x)
79
- x = self.conv_2(x * x_mask)
80
- x = torch.relu(x)
81
- x = self.norm_2(x)
82
- x = self.drop(x)
83
-
84
- output_probs = []
85
- for dur in [dur_r, dur_hat]:
86
- output_prob = self.forward_probability(x, x_mask, dur, g)
87
- output_probs.append(output_prob)
88
-
89
- return output_probs
90
-
91
-
92
- class TransformerCouplingBlock(nn.Module):
93
- def __init__(
94
- self,
95
- channels,
96
- hidden_channels,
97
- filter_channels,
98
- n_heads,
99
- n_layers,
100
- kernel_size,
101
- p_dropout,
102
- n_flows=4,
103
- gin_channels=0,
104
- share_parameter=False,
105
- ):
106
- super().__init__()
107
- self.channels = channels
108
- self.hidden_channels = hidden_channels
109
- self.kernel_size = kernel_size
110
- self.n_layers = n_layers
111
- self.n_flows = n_flows
112
- self.gin_channels = gin_channels
113
-
114
- self.flows = nn.ModuleList()
115
-
116
- self.wn = (
117
- attentions_onnx.FFT(
118
- hidden_channels,
119
- filter_channels,
120
- n_heads,
121
- n_layers,
122
- kernel_size,
123
- p_dropout,
124
- isflow=True,
125
- gin_channels=self.gin_channels,
126
- )
127
- if share_parameter
128
- else None
129
- )
130
-
131
- for i in range(n_flows):
132
- self.flows.append(
133
- modules.TransformerCouplingLayer(
134
- channels,
135
- hidden_channels,
136
- kernel_size,
137
- n_layers,
138
- n_heads,
139
- p_dropout,
140
- filter_channels,
141
- mean_only=True,
142
- wn_sharing_parameter=self.wn,
143
- gin_channels=self.gin_channels,
144
- )
145
- )
146
- self.flows.append(modules.Flip())
147
-
148
- def forward(self, x, x_mask, g=None, reverse=True):
149
- if not reverse:
150
- for flow in self.flows:
151
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
152
- else:
153
- for flow in reversed(self.flows):
154
- x = flow(x, x_mask, g=g, reverse=reverse)
155
- return x
156
-
157
-
158
- class StochasticDurationPredictor(nn.Module):
159
- def __init__(
160
- self,
161
- in_channels,
162
- filter_channels,
163
- kernel_size,
164
- p_dropout,
165
- n_flows=4,
166
- gin_channels=0,
167
- ):
168
- super().__init__()
169
- filter_channels = in_channels # it needs to be removed from future version.
170
- self.in_channels = in_channels
171
- self.filter_channels = filter_channels
172
- self.kernel_size = kernel_size
173
- self.p_dropout = p_dropout
174
- self.n_flows = n_flows
175
- self.gin_channels = gin_channels
176
-
177
- self.log_flow = modules.Log()
178
- self.flows = nn.ModuleList()
179
- self.flows.append(modules.ElementwiseAffine(2))
180
- for i in range(n_flows):
181
- self.flows.append(
182
- modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
183
- )
184
- self.flows.append(modules.Flip())
185
-
186
- self.post_pre = nn.Conv1d(1, filter_channels, 1)
187
- self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
188
- self.post_convs = modules.DDSConv(
189
- filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
190
- )
191
- self.post_flows = nn.ModuleList()
192
- self.post_flows.append(modules.ElementwiseAffine(2))
193
- for i in range(4):
194
- self.post_flows.append(
195
- modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
196
- )
197
- self.post_flows.append(modules.Flip())
198
-
199
- self.pre = nn.Conv1d(in_channels, filter_channels, 1)
200
- self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
201
- self.convs = modules.DDSConv(
202
- filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
203
- )
204
- if gin_channels != 0:
205
- self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
206
-
207
- def forward(self, x, x_mask, z, g=None):
208
- x = torch.detach(x)
209
- x = self.pre(x)
210
- if g is not None:
211
- g = torch.detach(g)
212
- x = x + self.cond(g)
213
- x = self.convs(x, x_mask)
214
- x = self.proj(x) * x_mask
215
-
216
- flows = list(reversed(self.flows))
217
- flows = flows[:-2] + [flows[-1]] # remove a useless vflow
218
- for flow in flows:
219
- z = flow(z, x_mask, g=x, reverse=True)
220
- z0, z1 = torch.split(z, [1, 1], 1)
221
- logw = z0
222
- return logw
223
-
224
-
225
- class DurationPredictor(nn.Module):
226
- def __init__(
227
- self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
228
- ):
229
- super().__init__()
230
-
231
- self.in_channels = in_channels
232
- self.filter_channels = filter_channels
233
- self.kernel_size = kernel_size
234
- self.p_dropout = p_dropout
235
- self.gin_channels = gin_channels
236
-
237
- self.drop = nn.Dropout(p_dropout)
238
- self.conv_1 = nn.Conv1d(
239
- in_channels, filter_channels, kernel_size, padding=kernel_size // 2
240
- )
241
- self.norm_1 = modules.LayerNorm(filter_channels)
242
- self.conv_2 = nn.Conv1d(
243
- filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
244
- )
245
- self.norm_2 = modules.LayerNorm(filter_channels)
246
- self.proj = nn.Conv1d(filter_channels, 1, 1)
247
-
248
- if gin_channels != 0:
249
- self.cond = nn.Conv1d(gin_channels, in_channels, 1)
250
-
251
- def forward(self, x, x_mask, g=None):
252
- x = torch.detach(x)
253
- if g is not None:
254
- g = torch.detach(g)
255
- x = x + self.cond(g)
256
- x = self.conv_1(x * x_mask)
257
- x = torch.relu(x)
258
- x = self.norm_1(x)
259
- x = self.drop(x)
260
- x = self.conv_2(x * x_mask)
261
- x = torch.relu(x)
262
- x = self.norm_2(x)
263
- x = self.drop(x)
264
- x = self.proj(x * x_mask)
265
- return x * x_mask
266
-
267
-
268
- class Bottleneck(nn.Sequential):
269
- def __init__(self, in_dim, hidden_dim):
270
- c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
271
- c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
272
- super().__init__(*[c_fc1, c_fc2])
273
-
274
-
275
- class Block(nn.Module):
276
- def __init__(self, in_dim, hidden_dim) -> None:
277
- super().__init__()
278
- self.norm = nn.LayerNorm(in_dim)
279
- self.mlp = MLP(in_dim, hidden_dim)
280
-
281
- def forward(self, x: torch.Tensor) -> torch.Tensor:
282
- x = x + self.mlp(self.norm(x))
283
- return x
284
-
285
-
286
- class MLP(nn.Module):
287
- def __init__(self, in_dim, hidden_dim):
288
- super().__init__()
289
- self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
290
- self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
291
- self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
292
-
293
- def forward(self, x: torch.Tensor):
294
- x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
295
- x = self.c_proj(x)
296
- return x
297
-
298
-
299
- class TextEncoder(nn.Module):
300
- def __init__(
301
- self,
302
- n_vocab,
303
- out_channels,
304
- hidden_channels,
305
- filter_channels,
306
- n_heads,
307
- n_layers,
308
- kernel_size,
309
- p_dropout,
310
- n_speakers,
311
- gin_channels=0,
312
- ):
313
- super().__init__()
314
- self.n_vocab = n_vocab
315
- self.out_channels = out_channels
316
- self.hidden_channels = hidden_channels
317
- self.filter_channels = filter_channels
318
- self.n_heads = n_heads
319
- self.n_layers = n_layers
320
- self.kernel_size = kernel_size
321
- self.p_dropout = p_dropout
322
- self.gin_channels = gin_channels
323
- self.emb = nn.Embedding(len(symbols), hidden_channels)
324
- nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
325
- self.tone_emb = nn.Embedding(num_tones, hidden_channels)
326
- nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
327
- self.language_emb = nn.Embedding(num_languages, hidden_channels)
328
- nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
329
- self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
330
- self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
331
- self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
332
- # self.emo_proj = nn.Linear(1024, 1024)
333
- # self.emo_quantizer = nn.ModuleList()
334
- # for i in range(0, n_speakers):
335
- # self.emo_quantizer.append(
336
- # VectorQuantize(
337
- # dim=1024,
338
- # codebook_size=10,
339
- # decay=0.8,
340
- # commitment_weight=1.0,
341
- # learnable_codebook=True,
342
- # ema_update=False,
343
- # )
344
- # )
345
- # self.emo_q_proj = nn.Linear(1024, hidden_channels)
346
- self.n_speakers = n_speakers
347
- self.in_feature_net = nn.Sequential(
348
- # input is assumed to an already normalized embedding
349
- nn.Linear(512, 1028, bias=False),
350
- nn.GELU(),
351
- nn.LayerNorm(1028),
352
- *[Block(1028, 512) for _ in range(1)],
353
- nn.Linear(1028, 512, bias=False),
354
- # normalize before passing to VQ?
355
- # nn.GELU(),
356
- # nn.LayerNorm(512),
357
- )
358
- self.emo_vq = VectorQuantize(
359
- dim=512,
360
- codebook_size=64,
361
- codebook_dim=32,
362
- commitment_weight=0.1,
363
- decay=0.85,
364
- heads=32,
365
- kmeans_iters=20,
366
- separate_codebook_per_head=True,
367
- stochastic_sample_codes=True,
368
- threshold_ema_dead_code=2,
369
- )
370
- self.out_feature_net = nn.Linear(512, hidden_channels)
371
-
372
- self.encoder = attentions_onnx.Encoder(
373
- hidden_channels,
374
- filter_channels,
375
- n_heads,
376
- n_layers,
377
- kernel_size,
378
- p_dropout,
379
- gin_channels=self.gin_channels,
380
- )
381
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
382
-
383
- def forward(
384
- self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g=None
385
- ):
386
- x_mask = torch.ones_like(x).unsqueeze(0)
387
- bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2)
388
- ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(
389
- 1, 2
390
- )
391
- en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(
392
- 1, 2
393
- )
394
- emo_emb = self.in_feature_net(emo.transpose(0, 1))
395
- emo_emb, _, _ = self.emo_vq(emo_emb.unsqueeze(1))
396
-
397
- emo_emb = self.out_feature_net(emo_emb)
398
-
399
- x = (
400
- self.emb(x)
401
- + self.tone_emb(tone)
402
- + self.language_emb(language)
403
- + bert_emb
404
- + ja_bert_emb
405
- + en_bert_emb
406
- + emo_emb
407
- ) * math.sqrt(
408
- self.hidden_channels
409
- ) # [b, t, h]
410
- x = torch.transpose(x, 1, -1) # [b, h, t]
411
- x_mask = x_mask.to(x.dtype)
412
-
413
- x = self.encoder(x * x_mask, x_mask, g=g)
414
- stats = self.proj(x) * x_mask
415
-
416
- m, logs = torch.split(stats, self.out_channels, dim=1)
417
- return x, m, logs, x_mask
418
-
419
-
420
- class ResidualCouplingBlock(nn.Module):
421
- def __init__(
422
- self,
423
- channels,
424
- hidden_channels,
425
- kernel_size,
426
- dilation_rate,
427
- n_layers,
428
- n_flows=4,
429
- gin_channels=0,
430
- ):
431
- super().__init__()
432
- self.channels = channels
433
- self.hidden_channels = hidden_channels
434
- self.kernel_size = kernel_size
435
- self.dilation_rate = dilation_rate
436
- self.n_layers = n_layers
437
- self.n_flows = n_flows
438
- self.gin_channels = gin_channels
439
-
440
- self.flows = nn.ModuleList()
441
- for i in range(n_flows):
442
- self.flows.append(
443
- modules.ResidualCouplingLayer(
444
- channels,
445
- hidden_channels,
446
- kernel_size,
447
- dilation_rate,
448
- n_layers,
449
- gin_channels=gin_channels,
450
- mean_only=True,
451
- )
452
- )
453
- self.flows.append(modules.Flip())
454
-
455
- def forward(self, x, x_mask, g=None, reverse=True):
456
- if not reverse:
457
- for flow in self.flows:
458
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
459
- else:
460
- for flow in reversed(self.flows):
461
- x = flow(x, x_mask, g=g, reverse=reverse)
462
- return x
463
-
464
-
465
- class PosteriorEncoder(nn.Module):
466
- def __init__(
467
- self,
468
- in_channels,
469
- out_channels,
470
- hidden_channels,
471
- kernel_size,
472
- dilation_rate,
473
- n_layers,
474
- gin_channels=0,
475
- ):
476
- super().__init__()
477
- self.in_channels = in_channels
478
- self.out_channels = out_channels
479
- self.hidden_channels = hidden_channels
480
- self.kernel_size = kernel_size
481
- self.dilation_rate = dilation_rate
482
- self.n_layers = n_layers
483
- self.gin_channels = gin_channels
484
-
485
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
486
- self.enc = modules.WN(
487
- hidden_channels,
488
- kernel_size,
489
- dilation_rate,
490
- n_layers,
491
- gin_channels=gin_channels,
492
- )
493
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
494
-
495
- def forward(self, x, x_lengths, g=None):
496
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
497
- x.dtype
498
- )
499
- x = self.pre(x) * x_mask
500
- x = self.enc(x, x_mask, g=g)
501
- stats = self.proj(x) * x_mask
502
- m, logs = torch.split(stats, self.out_channels, dim=1)
503
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
504
- return z, m, logs, x_mask
505
-
506
-
507
- class Generator(torch.nn.Module):
508
- def __init__(
509
- self,
510
- initial_channel,
511
- resblock,
512
- resblock_kernel_sizes,
513
- resblock_dilation_sizes,
514
- upsample_rates,
515
- upsample_initial_channel,
516
- upsample_kernel_sizes,
517
- gin_channels=0,
518
- ):
519
- super(Generator, self).__init__()
520
- self.num_kernels = len(resblock_kernel_sizes)
521
- self.num_upsamples = len(upsample_rates)
522
- self.conv_pre = Conv1d(
523
- initial_channel, upsample_initial_channel, 7, 1, padding=3
524
- )
525
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
526
-
527
- self.ups = nn.ModuleList()
528
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
529
- self.ups.append(
530
- weight_norm(
531
- ConvTranspose1d(
532
- upsample_initial_channel // (2**i),
533
- upsample_initial_channel // (2 ** (i + 1)),
534
- k,
535
- u,
536
- padding=(k - u) // 2,
537
- )
538
- )
539
- )
540
-
541
- self.resblocks = nn.ModuleList()
542
- for i in range(len(self.ups)):
543
- ch = upsample_initial_channel // (2 ** (i + 1))
544
- for j, (k, d) in enumerate(
545
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
546
- ):
547
- self.resblocks.append(resblock(ch, k, d))
548
-
549
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
550
- self.ups.apply(init_weights)
551
-
552
- if gin_channels != 0:
553
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
554
-
555
- def forward(self, x, g=None):
556
- x = self.conv_pre(x)
557
- if g is not None:
558
- x = x + self.cond(g)
559
-
560
- for i in range(self.num_upsamples):
561
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
562
- x = self.ups[i](x)
563
- xs = None
564
- for j in range(self.num_kernels):
565
- if xs is None:
566
- xs = self.resblocks[i * self.num_kernels + j](x)
567
- else:
568
- xs += self.resblocks[i * self.num_kernels + j](x)
569
- x = xs / self.num_kernels
570
- x = F.leaky_relu(x)
571
- x = self.conv_post(x)
572
- x = torch.tanh(x)
573
-
574
- return x
575
-
576
- def remove_weight_norm(self):
577
- print("Removing weight norm...")
578
- for layer in self.ups:
579
- remove_weight_norm(layer)
580
- for layer in self.resblocks:
581
- layer.remove_weight_norm()
582
-
583
-
584
- class DiscriminatorP(torch.nn.Module):
585
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
586
- super(DiscriminatorP, self).__init__()
587
- self.period = period
588
- self.use_spectral_norm = use_spectral_norm
589
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
590
- self.convs = nn.ModuleList(
591
- [
592
- norm_f(
593
- Conv2d(
594
- 1,
595
- 32,
596
- (kernel_size, 1),
597
- (stride, 1),
598
- padding=(get_padding(kernel_size, 1), 0),
599
- )
600
- ),
601
- norm_f(
602
- Conv2d(
603
- 32,
604
- 128,
605
- (kernel_size, 1),
606
- (stride, 1),
607
- padding=(get_padding(kernel_size, 1), 0),
608
- )
609
- ),
610
- norm_f(
611
- Conv2d(
612
- 128,
613
- 512,
614
- (kernel_size, 1),
615
- (stride, 1),
616
- padding=(get_padding(kernel_size, 1), 0),
617
- )
618
- ),
619
- norm_f(
620
- Conv2d(
621
- 512,
622
- 1024,
623
- (kernel_size, 1),
624
- (stride, 1),
625
- padding=(get_padding(kernel_size, 1), 0),
626
- )
627
- ),
628
- norm_f(
629
- Conv2d(
630
- 1024,
631
- 1024,
632
- (kernel_size, 1),
633
- 1,
634
- padding=(get_padding(kernel_size, 1), 0),
635
- )
636
- ),
637
- ]
638
- )
639
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
640
-
641
- def forward(self, x):
642
- fmap = []
643
-
644
- # 1d to 2d
645
- b, c, t = x.shape
646
- if t % self.period != 0: # pad first
647
- n_pad = self.period - (t % self.period)
648
- x = F.pad(x, (0, n_pad), "reflect")
649
- t = t + n_pad
650
- x = x.view(b, c, t // self.period, self.period)
651
-
652
- for layer in self.convs:
653
- x = layer(x)
654
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
655
- fmap.append(x)
656
- x = self.conv_post(x)
657
- fmap.append(x)
658
- x = torch.flatten(x, 1, -1)
659
-
660
- return x, fmap
661
-
662
-
663
- class DiscriminatorS(torch.nn.Module):
664
- def __init__(self, use_spectral_norm=False):
665
- super(DiscriminatorS, self).__init__()
666
- norm_f = weight_norm if use_spectral_norm is False else spectral_norm
667
- self.convs = nn.ModuleList(
668
- [
669
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
670
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
671
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
672
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
673
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
674
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
675
- ]
676
- )
677
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
678
-
679
- def forward(self, x):
680
- fmap = []
681
-
682
- for layer in self.convs:
683
- x = layer(x)
684
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
685
- fmap.append(x)
686
- x = self.conv_post(x)
687
- fmap.append(x)
688
- x = torch.flatten(x, 1, -1)
689
-
690
- return x, fmap
691
-
692
-
693
- class MultiPeriodDiscriminator(torch.nn.Module):
694
- def __init__(self, use_spectral_norm=False):
695
- super(MultiPeriodDiscriminator, self).__init__()
696
- periods = [2, 3, 5, 7, 11]
697
-
698
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
699
- discs = discs + [
700
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
701
- ]
702
- self.discriminators = nn.ModuleList(discs)
703
-
704
- def forward(self, y, y_hat):
705
- y_d_rs = []
706
- y_d_gs = []
707
- fmap_rs = []
708
- fmap_gs = []
709
- for i, d in enumerate(self.discriminators):
710
- y_d_r, fmap_r = d(y)
711
- y_d_g, fmap_g = d(y_hat)
712
- y_d_rs.append(y_d_r)
713
- y_d_gs.append(y_d_g)
714
- fmap_rs.append(fmap_r)
715
- fmap_gs.append(fmap_g)
716
-
717
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
718
-
719
-
720
- class ReferenceEncoder(nn.Module):
721
- """
722
- inputs --- [N, Ty/r, n_mels*r] mels
723
- outputs --- [N, ref_enc_gru_size]
724
- """
725
-
726
- def __init__(self, spec_channels, gin_channels=0):
727
- super().__init__()
728
- self.spec_channels = spec_channels
729
- ref_enc_filters = [32, 32, 64, 64, 128, 128]
730
- K = len(ref_enc_filters)
731
- filters = [1] + ref_enc_filters
732
- convs = [
733
- weight_norm(
734
- nn.Conv2d(
735
- in_channels=filters[i],
736
- out_channels=filters[i + 1],
737
- kernel_size=(3, 3),
738
- stride=(2, 2),
739
- padding=(1, 1),
740
- )
741
- )
742
- for i in range(K)
743
- ]
744
- self.convs = nn.ModuleList(convs)
745
- # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
746
-
747
- out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
748
- self.gru = nn.GRU(
749
- input_size=ref_enc_filters[-1] * out_channels,
750
- hidden_size=256 // 2,
751
- batch_first=True,
752
- )
753
- self.proj = nn.Linear(128, gin_channels)
754
-
755
- def forward(self, inputs, mask=None):
756
- N = inputs.size(0)
757
- out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
758
- for conv in self.convs:
759
- out = conv(out)
760
- # out = wn(out)
761
- out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
762
-
763
- out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
764
- T = out.size(1)
765
- N = out.size(0)
766
- out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
767
-
768
- self.gru.flatten_parameters()
769
- memory, out = self.gru(out) # out --- [1, N, 128]
770
-
771
- return self.proj(out.squeeze(0))
772
-
773
- def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
774
- for i in range(n_convs):
775
- L = (L - kernel_size + 2 * pad) // stride + 1
776
- return L
777
-
778
-
779
- class SynthesizerTrn(nn.Module):
780
- """
781
- Synthesizer for Training
782
- """
783
-
784
- def __init__(
785
- self,
786
- n_vocab,
787
- spec_channels,
788
- segment_size,
789
- inter_channels,
790
- hidden_channels,
791
- filter_channels,
792
- n_heads,
793
- n_layers,
794
- kernel_size,
795
- p_dropout,
796
- resblock,
797
- resblock_kernel_sizes,
798
- resblock_dilation_sizes,
799
- upsample_rates,
800
- upsample_initial_channel,
801
- upsample_kernel_sizes,
802
- n_speakers=256,
803
- gin_channels=256,
804
- use_sdp=True,
805
- n_flow_layer=4,
806
- n_layers_trans_flow=4,
807
- flow_share_parameter=False,
808
- use_transformer_flow=True,
809
- **kwargs,
810
- ):
811
- super().__init__()
812
- self.n_vocab = n_vocab
813
- self.spec_channels = spec_channels
814
- self.inter_channels = inter_channels
815
- self.hidden_channels = hidden_channels
816
- self.filter_channels = filter_channels
817
- self.n_heads = n_heads
818
- self.n_layers = n_layers
819
- self.kernel_size = kernel_size
820
- self.p_dropout = p_dropout
821
- self.resblock = resblock
822
- self.resblock_kernel_sizes = resblock_kernel_sizes
823
- self.resblock_dilation_sizes = resblock_dilation_sizes
824
- self.upsample_rates = upsample_rates
825
- self.upsample_initial_channel = upsample_initial_channel
826
- self.upsample_kernel_sizes = upsample_kernel_sizes
827
- self.segment_size = segment_size
828
- self.n_speakers = n_speakers
829
- self.gin_channels = gin_channels
830
- self.n_layers_trans_flow = n_layers_trans_flow
831
- self.use_spk_conditioned_encoder = kwargs.get(
832
- "use_spk_conditioned_encoder", True
833
- )
834
- self.use_sdp = use_sdp
835
- self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
836
- self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
837
- self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
838
- self.current_mas_noise_scale = self.mas_noise_scale_initial
839
- if self.use_spk_conditioned_encoder and gin_channels > 0:
840
- self.enc_gin_channels = gin_channels
841
- self.enc_p = TextEncoder(
842
- n_vocab,
843
- inter_channels,
844
- hidden_channels,
845
- filter_channels,
846
- n_heads,
847
- n_layers,
848
- kernel_size,
849
- p_dropout,
850
- self.n_speakers,
851
- gin_channels=self.enc_gin_channels,
852
- )
853
- self.dec = Generator(
854
- inter_channels,
855
- resblock,
856
- resblock_kernel_sizes,
857
- resblock_dilation_sizes,
858
- upsample_rates,
859
- upsample_initial_channel,
860
- upsample_kernel_sizes,
861
- gin_channels=gin_channels,
862
- )
863
- self.enc_q = PosteriorEncoder(
864
- spec_channels,
865
- inter_channels,
866
- hidden_channels,
867
- 5,
868
- 1,
869
- 16,
870
- gin_channels=gin_channels,
871
- )
872
- if use_transformer_flow:
873
- self.flow = TransformerCouplingBlock(
874
- inter_channels,
875
- hidden_channels,
876
- filter_channels,
877
- n_heads,
878
- n_layers_trans_flow,
879
- 5,
880
- p_dropout,
881
- n_flow_layer,
882
- gin_channels=gin_channels,
883
- share_parameter=flow_share_parameter,
884
- )
885
- else:
886
- self.flow = ResidualCouplingBlock(
887
- inter_channels,
888
- hidden_channels,
889
- 5,
890
- 1,
891
- n_flow_layer,
892
- gin_channels=gin_channels,
893
- )
894
- self.sdp = StochasticDurationPredictor(
895
- hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
896
- )
897
- self.dp = DurationPredictor(
898
- hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
899
- )
900
-
901
- if n_speakers >= 1:
902
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
903
- else:
904
- self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
905
-
906
- def export_onnx(
907
- self,
908
- path,
909
- max_len=None,
910
- sdp_ratio=0,
911
- y=None,
912
- ):
913
- noise_scale = 0.667
914
- length_scale = 1
915
- noise_scale_w = 0.8
916
- x = (
917
- torch.LongTensor(
918
- [
919
- 0,
920
- 97,
921
- 0,
922
- 8,
923
- 0,
924
- 78,
925
- 0,
926
- 8,
927
- 0,
928
- 76,
929
- 0,
930
- 37,
931
- 0,
932
- 40,
933
- 0,
934
- 97,
935
- 0,
936
- 8,
937
- 0,
938
- 23,
939
- 0,
940
- 8,
941
- 0,
942
- 74,
943
- 0,
944
- 26,
945
- 0,
946
- 104,
947
- 0,
948
- ]
949
- )
950
- .unsqueeze(0)
951
- .cpu()
952
- )
953
- tone = torch.zeros_like(x).cpu()
954
- language = torch.zeros_like(x).cpu()
955
- x_lengths = torch.LongTensor([x.shape[1]]).cpu()
956
- sid = torch.LongTensor([0]).cpu()
957
- bert = torch.randn(size=(x.shape[1], 1024)).cpu()
958
- ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
959
- en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
960
-
961
- if self.n_speakers > 0:
962
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
963
- torch.onnx.export(
964
- self.emb_g,
965
- (sid),
966
- f"onnx/{path}/{path}_emb.onnx",
967
- input_names=["sid"],
968
- output_names=["g"],
969
- verbose=True,
970
- )
971
- else:
972
- g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
973
-
974
- emo = torch.randn(512, 1)
975
-
976
- torch.onnx.export(
977
- self.enc_p,
978
- (x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g),
979
- f"onnx/{path}/{path}_enc_p.onnx",
980
- input_names=[
981
- "x",
982
- "x_lengths",
983
- "t",
984
- "language",
985
- "bert_0",
986
- "bert_1",
987
- "bert_2",
988
- "emo",
989
- "g",
990
- ],
991
- output_names=["xout", "m_p", "logs_p", "x_mask"],
992
- dynamic_axes={
993
- "x": [0, 1],
994
- "t": [0, 1],
995
- "language": [0, 1],
996
- "bert_0": [0],
997
- "bert_1": [0],
998
- "bert_2": [0],
999
- "xout": [0, 2],
1000
- "m_p": [0, 2],
1001
- "logs_p": [0, 2],
1002
- "x_mask": [0, 2],
1003
- },
1004
- verbose=True,
1005
- opset_version=16,
1006
- )
1007
-
1008
- x, m_p, logs_p, x_mask = self.enc_p(
1009
- x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, g
1010
- )
1011
-
1012
- zinput = (
1013
- torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
1014
- * noise_scale_w
1015
- )
1016
- torch.onnx.export(
1017
- self.sdp,
1018
- (x, x_mask, zinput, g),
1019
- f"onnx/{path}/{path}_sdp.onnx",
1020
- input_names=["x", "x_mask", "zin", "g"],
1021
- output_names=["logw"],
1022
- dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
1023
- verbose=True,
1024
- )
1025
- torch.onnx.export(
1026
- self.dp,
1027
- (x, x_mask, g),
1028
- f"onnx/{path}/{path}_dp.onnx",
1029
- input_names=["x", "x_mask", "g"],
1030
- output_names=["logw"],
1031
- dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
1032
- verbose=True,
1033
- )
1034
- logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
1035
- x, x_mask, g=g
1036
- ) * (1 - sdp_ratio)
1037
- w = torch.exp(logw) * x_mask * length_scale
1038
- w_ceil = torch.ceil(w)
1039
- y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
1040
- y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
1041
- x_mask.dtype
1042
- )
1043
- attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
1044
- attn = commons.generate_path(w_ceil, attn_mask)
1045
-
1046
- m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
1047
- 1, 2
1048
- ) # [b, t', t], [b, t, d] -> [b, d, t']
1049
- logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
1050
- 1, 2
1051
- ) # [b, t', t], [b, t, d] -> [b, d, t']
1052
-
1053
- z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
1054
- torch.onnx.export(
1055
- self.flow,
1056
- (z_p, y_mask, g),
1057
- f"onnx/{path}/{path}_flow.onnx",
1058
- input_names=["z_p", "y_mask", "g"],
1059
- output_names=["z"],
1060
- dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
1061
- verbose=True,
1062
- )
1063
-
1064
- z = self.flow(z_p, y_mask, g=g, reverse=True)
1065
- z_in = (z * y_mask)[:, :, :max_len]
1066
-
1067
- torch.onnx.export(
1068
- self.dec,
1069
- (z_in, g),
1070
- f"onnx/{path}/{path}_dec.onnx",
1071
- input_names=["z_in", "g"],
1072
- output_names=["o"],
1073
- dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
1074
- verbose=True,
1075
- )
1076
- o = self.dec((z * y_mask)[:, :, :max_len], g=g)

onnx_modules/V220/text/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .symbols import *

onnx_modules/V220/text/symbols.py DELETED
@@ -1,187 +0,0 @@
1
- punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
- pu_symbols = punctuation + ["SP", "UNK"]
3
- pad = "_"
4
-
5
- # chinese
6
- zh_symbols = [
7
- "E",
8
- "En",
9
- "a",
10
- "ai",
11
- "an",
12
- "ang",
13
- "ao",
14
- "b",
15
- "c",
16
- "ch",
17
- "d",
18
- "e",
19
- "ei",
20
- "en",
21
- "eng",
22
- "er",
23
- "f",
24
- "g",
25
- "h",
26
- "i",
27
- "i0",
28
- "ia",
29
- "ian",
30
- "iang",
31
- "iao",
32
- "ie",
33
- "in",
34
- "ing",
35
- "iong",
36
- "ir",
37
- "iu",
38
- "j",
39
- "k",
40
- "l",
41
- "m",
42
- "n",
43
- "o",
44
- "ong",
45
- "ou",
46
- "p",
47
- "q",
48
- "r",
49
- "s",
50
- "sh",
51
- "t",
52
- "u",
53
- "ua",
54
- "uai",
55
- "uan",
56
- "uang",
57
- "ui",
58
- "un",
59
- "uo",
60
- "v",
61
- "van",
62
- "ve",
63
- "vn",
64
- "w",
65
- "x",
66
- "y",
67
- "z",
68
- "zh",
69
- "AA",
70
- "EE",
71
- "OO",
72
- ]
73
- num_zh_tones = 6
74
-
75
- # japanese
76
- ja_symbols = [
77
- "N",
78
- "a",
79
- "a:",
80
- "b",
81
- "by",
82
- "ch",
83
- "d",
84
- "dy",
85
- "e",
86
- "e:",
87
- "f",
88
- "g",
89
- "gy",
90
- "h",
91
- "hy",
92
- "i",
93
- "i:",
94
- "j",
95
- "k",
96
- "ky",
97
- "m",
98
- "my",
99
- "n",
100
- "ny",
101
- "o",
102
- "o:",
103
- "p",
104
- "py",
105
- "q",
106
- "r",
107
- "ry",
108
- "s",
109
- "sh",
110
- "t",
111
- "ts",
112
- "ty",
113
- "u",
114
- "u:",
115
- "w",
116
- "y",
117
- "z",
118
- "zy",
119
- ]
120
- num_ja_tones = 2
121
-
122
- # English
123
- en_symbols = [
124
- "aa",
125
- "ae",
126
- "ah",
127
- "ao",
128
- "aw",
129
- "ay",
130
- "b",
131
- "ch",
132
- "d",
133
- "dh",
134
- "eh",
135
- "er",
136
- "ey",
137
- "f",
138
- "g",
139
- "hh",
140
- "ih",
141
- "iy",
142
- "jh",
143
- "k",
144
- "l",
145
- "m",
146
- "n",
147
- "ng",
148
- "ow",
149
- "oy",
150
- "p",
151
- "r",
152
- "s",
153
- "sh",
154
- "t",
155
- "th",
156
- "uh",
157
- "uw",
158
- "V",
159
- "w",
160
- "y",
161
- "z",
162
- "zh",
163
- ]
164
- num_en_tones = 4
165
-
166
- # combine all symbols
167
- normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168
- symbols = [pad] + normal_symbols + pu_symbols
169
- sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170
-
171
- # combine all tones
172
- num_tones = num_zh_tones + num_ja_tones + num_en_tones
173
-
174
- # language maps
175
- language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176
- num_languages = len(language_id_map.keys())
177
-
178
- language_tone_start_map = {
179
- "ZH": 0,
180
- "JP": num_zh_tones,
181
- "EN": num_zh_tones + num_ja_tones,
182
- }
183
-
184
- if __name__ == "__main__":
185
- a = set(zh_symbols)
186
- b = set(en_symbols)
187
- print(sorted(a & b))

onnx_modules/__init__.py DELETED
@@ -1,50 +0,0 @@
1
- from utils import get_hparams_from_file, load_checkpoint
2
- import json
3
-
4
-
5
- def export_onnx(export_path, model_path, config_path):
6
- hps = get_hparams_from_file(config_path)
7
- version = hps.version[0:3]
8
- if version == "2.0":
9
- from .V200 import SynthesizerTrn, symbols
10
- elif version == "2.1":
11
- from .V210 import SynthesizerTrn, symbols
12
- elif version == "2.2":
13
- from .V220 import SynthesizerTrn, symbols
14
- net_g = SynthesizerTrn(
15
- len(symbols),
16
- hps.data.filter_length // 2 + 1,
17
- hps.train.segment_size // hps.data.hop_length,
18
- n_speakers=hps.data.n_speakers,
19
- **hps.model,
20
- )
21
- _ = net_g.eval()
22
- _ = load_checkpoint(model_path, net_g, None, skip_optimizer=True)
23
- net_g.cpu()
24
- net_g.export_onnx(export_path)
25
-
26
- spklist = []
27
- for key in hps.data.spk2id.keys():
28
- spklist.append(key)
29
-
30
- MoeVSConf = {
31
- "Folder": f"{export_path}",
32
- "Name": f"{export_path}",
33
- "Type": "BertVits",
34
- "Symbol": symbols,
35
- "Cleaner": "",
36
- "Rate": hps.data.sampling_rate,
37
- "CharaMix": True,
38
- "Characters": spklist,
39
- "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]},
40
- "Dict": "BasicDict",
41
- "BertPath": [
42
- "chinese-roberta-wwm-ext-large",
43
- "deberta-v2-large-japanese",
44
- "bert-base-japanese-v3",
45
- ],
46
- "Clap": "clap-htsat-fused",
47
- }
48
-
49
- with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile:
50
- json.dump(MoeVSConf, MoeVsConfFile, indent=4)