Spaces:

Flux9665
/

MassivelyMultilingualTTS

Running on T4

App Files Files

MassivelyMultilingualTTS / Architectures /ToucanTTS /StochasticToucanTTS /StochasticVariancePredictor.py

Flux9665

use explicit code instead of relying on release download

9e275b8 9 months ago

raw

history blame

18.5 kB

	"""
	Code taken and adapted from https://github.com/jaywalnut310/vits

	MIT License

	Copyright (c) 2021 Jaehyeon Kim

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in all
	copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	SOFTWARE.
	"""

	import math

	import numpy as np
	import torch
	from torch import nn
	from torch.nn import functional as F

	# Default lower bounds used by the spline functions in this file (as the defaults of
	# their `min_bin_width`, `min_bin_height` and `min_derivative` parameters). They are
	# added to the softmax-normalised bin sizes and to the softplus of the raw knot
	# derivatives, so no bin degenerates to zero size and no knot slope reaches zero.
	DEFAULT_MIN_BIN_WIDTH = 1e-3
	DEFAULT_MIN_BIN_HEIGHT = 1e-3
	DEFAULT_MIN_DERIVATIVE = 1e-3


	class StochasticVariancePredictor(nn.Module):
	    """
	    Flow-based stochastic predictor, adapted from the stochastic duration predictor of VITS.

	    With ``reverse=False`` (training) the module returns one loss value per batch item:
	    the negative log-likelihood of the target ``w`` under the main flow stack plus the
	    log-density term of an auxiliary ("post") flow stack that produces the noise which
	    is subtracted from ``w``.

	    With ``reverse=True`` (inference) scaled Gaussian noise is pushed backwards through
	    the main flow stack and the first output channel is returned. The ``Log`` flow is
	    not inverted on this path, so the result is the value in the log domain.
	    """

	    def __init__(self, in_channels, kernel_size, p_dropout, n_flows=4, conditioning_signal_channels=0):
	        """
	        Args:
	            in_channels: number of channels of the input ``x``; also used as the hidden
	                width of every sub-network.
	            kernel_size: kernel size of the DDSConv stacks.
	            p_dropout: dropout probability of the two encoder DDSConv stacks.
	            n_flows: number of ConvFlow/Flip pairs in the main flow stack. The auxiliary
	                stack always uses 4 pairs.
	            conditioning_signal_channels: channel count of the optional conditioning
	                signal ``g``. ``0`` or ``None`` means no conditioning layer is created.
	        """
	        super().__init__()
	        self.in_channels = in_channels
	        self.filter_channels = in_channels
	        self.kernel_size = kernel_size
	        self.p_dropout = p_dropout
	        self.n_flows = n_flows
	        self.gin_channels = conditioning_signal_channels if conditioning_signal_channels is not None else 0

	        # Main flow stack: maps [log-target, noise channel] to a standard normal.
	        self.log_flow = Log()
	        self.flows = nn.ModuleList()
	        self.flows.append(ElementwiseAffine(2))
	        for _ in range(n_flows):
	            self.flows.append(ConvFlow(2, in_channels, kernel_size, n_layers=3))
	            self.flows.append(Flip())

	        # Auxiliary stack: encodes the target and transforms Gaussian noise conditioned on it.
	        self.post_pre = nn.Conv1d(1, in_channels, 1)
	        self.post_proj = nn.Conv1d(in_channels, in_channels, 1)
	        self.post_convs = DDSConv(in_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
	        self.post_flows = nn.ModuleList()
	        self.post_flows.append(ElementwiseAffine(2))
	        for _ in range(4):
	            self.post_flows.append(ConvFlow(2, in_channels, kernel_size, n_layers=3))
	            self.post_flows.append(Flip())

	        # Encoder for the conditioning sequence x.
	        self.pre = nn.Conv1d(in_channels, in_channels, 1)
	        self.proj = nn.Conv1d(in_channels, in_channels, 1)
	        self.convs = DDSConv(in_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
	        if self.gin_channels != 0:
	            self.cond = nn.Conv1d(self.gin_channels, in_channels, 1)

	    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=0.3):
	        """
	        Args:
	            x: input sequence with ``in_channels`` channels on dim 1 and time on dim 2.
	            x_mask: mask multiplied onto every intermediate result
	                (presumably [b, 1, t] with ones at valid frames -- confirm with the caller).
	            w: target values with a single channel on dim 1; required when ``reverse`` is False.
	            g: optional conditioning signal; it is detached, so no gradient flows into it.
	            reverse: False computes the training loss, True samples a prediction.
	            noise_scale: factor applied to the Gaussian noise when sampling.

	        Returns:
	            ``reverse=False``: loss tensor of shape [b].
	            ``reverse=True``: first channel of the inverted flow output (one channel on dim 1).

	        Raises:
	            ValueError: if ``w`` is missing in training mode, or if ``g`` is passed to a
	                module that was built without a conditioning layer.
	        """
	        # Explicit checks instead of `assert`, which is stripped when Python runs with -O.
	        if not reverse and w is None:
	            raise ValueError("w must be provided when reverse=False")
	        if g is not None and self.gin_channels == 0:
	            raise ValueError("g was passed, but the module was built with conditioning_signal_channels=0")

	        x = self.pre(x)
	        if g is not None:
	            g = torch.detach(g)
	            x = x + self.cond(g)
	        x = self.convs(x, x_mask)
	        x = self.proj(x) * x_mask

	        if reverse:
	            flows = list(reversed(self.flows))
	            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
	            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
	            # noise scale 0.8 derived from coqui implementation, but dropped to 0.3 during testing. Might not be ideal yet.
	            for flow in flows:
	                z = flow(z, x_mask, g=x, reverse=reverse)
	            z0, z1 = torch.split(z, [1, 1], 1)
	            logw = z0
	            return logw

	        # Auxiliary flows: transform Gaussian noise e_q conditioned on x and the encoded target.
	        logdet_tot_q = 0
	        h_w = self.post_pre(w)
	        h_w = self.post_convs(h_w, x_mask)
	        h_w = self.post_proj(h_w) * x_mask
	        e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
	        z_q = e_q
	        for flow in self.post_flows:
	            z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
	            logdet_tot_q += logdet_q
	        z_u, z1 = torch.split(z_q, [1, 1], 1)
	        # u lies in (0, 1) and is subtracted from the target; z1 is kept as a second channel.
	        u = torch.sigmoid(z_u) * x_mask
	        z0 = (w - u) * x_mask
	        # logsigmoid(z) + logsigmoid(-z) is the log-derivative of the sigmoid.
	        logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
	        logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

	        # Main flows: negative log-likelihood of [log(w - u), z1] under a standard normal.
	        logdet_tot = 0
	        z0, logdet = self.log_flow(z0, x_mask)
	        logdet_tot += logdet
	        z = torch.cat([z0, z1], 1)
	        for flow in self.flows:
	            z, logdet = flow(z, x_mask, g=x, reverse=reverse)
	            logdet_tot = logdet_tot + logdet
	        nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
	        return nll + logq  # [b]


	class Log(nn.Module):
	    """
	    Element-wise logarithm flow.

	    Forward direction: returns ``log(max(x, 1e-6)) * x_mask`` together with the
	    log-determinant, which is the sum of ``-log(x)`` over dims 1 and 2.
	    Reverse direction: returns ``exp(x) * x_mask`` only.
	    """

	    def forward(self, x, x_mask, reverse=False, **kwargs):
	        if reverse:
	            return torch.exp(x) * x_mask

	        # Clamp before the log so zeros (e.g. masked frames) do not produce -inf.
	        log_x = torch.log(torch.clamp_min(x, 1e-6)) * x_mask
	        log_det = torch.sum(-log_x, [1, 2])
	        return log_x, log_det


	class Flip(nn.Module):
	    """
	    Flow step that reverses the channel order (dim 1) of its input.

	    The permutation does not change volume, so the forward log-determinant is zero.
	    """

	    def forward(self, x, args, reverse=False, *extra_args, **kwargs):
	        """
	        Args:
	            x: tensor whose dim 1 is flipped.
	            args: ignored; the flow call sites pass the mask in this position.
	            reverse: if True only the flipped tensor is returned.
	            *extra_args, **kwargs: ignored. Accepting keywords is required because the
	                flow loops call every step with ``g=...`` (and ``reverse=...``);
	                without ``**kwargs`` those calls raise ``TypeError``.

	        Returns:
	            ``(flipped, logdet)`` with ``logdet`` a zero tensor of shape [b] when
	            ``reverse`` is False, otherwise just ``flipped``.
	        """
	        flipped = torch.flip(x, [1])
	        if reverse:
	            return flipped
	        logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
	        return flipped, logdet


	class DDSConv(nn.Module):
	    """
	    Dilated and Depth-Separable Convolution

	    A stack of ``n_layers`` residual layers. Each layer applies a depthwise
	    (``groups=channels``) dilated convolution, LayerNorm, GELU, a 1x1 convolution,
	    LayerNorm, GELU and dropout, and adds the result to its input. The dilation of
	    layer ``i`` is ``kernel_size ** i``.
	    """

	    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
	        super().__init__()
	        self.channels = channels
	        self.kernel_size = kernel_size
	        self.n_layers = n_layers
	        self.p_dropout = p_dropout

	        self.drop = nn.Dropout(p_dropout)
	        self.convs_sep = nn.ModuleList()
	        self.convs_1x1 = nn.ModuleList()
	        self.norms_1 = nn.ModuleList()
	        self.norms_2 = nn.ModuleList()
	        for layer_idx in range(n_layers):
	            dilation = kernel_size ** layer_idx
	            # Padding chosen so the dilated convolution keeps the sequence length (odd kernels).
	            padding = dilation * (kernel_size - 1) // 2
	            depthwise = nn.Conv1d(channels,
	                                  channels,
	                                  kernel_size,
	                                  groups=channels,
	                                  dilation=dilation,
	                                  padding=padding)
	            self.convs_sep.append(depthwise)
	            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
	            self.norms_1.append(LayerNorm(channels))
	            self.norms_2.append(LayerNorm(channels))

	    def forward(self, x, x_mask, g=None):
	        """
	        Run the residual stack on ``x``; ``g``, if given, is added to ``x`` first.
	        The input of every depthwise convolution and the final output are multiplied by ``x_mask``.
	        """
	        if g is not None:
	            x = x + g
	        layers = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
	        for depthwise, norm_a, pointwise, norm_b in layers:
	            branch = depthwise(x * x_mask)
	            branch = F.gelu(norm_a(branch))
	            branch = F.gelu(norm_b(pointwise(branch)))
	            x = x + self.drop(branch)
	        return x * x_mask


	class ConvFlow(nn.Module):
	    """
	    Coupling flow based on a piecewise rational-quadratic spline.

	    The input is split in two halves along dim 1. The first half is passed through
	    unchanged and, via a DDSConv network, produces the spline parameters that
	    transform the second half.
	    """

	    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
	        super().__init__()
	        self.in_channels = in_channels
	        self.filter_channels = filter_channels
	        self.kernel_size = kernel_size
	        self.n_layers = n_layers
	        self.num_bins = num_bins
	        self.tail_bound = tail_bound
	        self.half_channels = in_channels // 2

	        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
	        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
	        # Per transformed channel: num_bins widths, num_bins heights, num_bins - 1 derivatives.
	        params_per_channel = num_bins * 3 - 1
	        self.proj = nn.Conv1d(filter_channels, self.half_channels * params_per_channel, 1)
	        # Zero-initialised projection: all raw spline parameters start at zero.
	        self.proj.weight.data.zero_()
	        self.proj.bias.data.zero_()

	    def forward(self, x, x_mask, g=None, reverse=False):
	        """
	        Returns ``(x, logdet)`` when ``reverse`` is False and only ``x`` otherwise.
	        ``g`` is an optional conditioning tensor handed to the DDSConv network.
	        """
	        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
	        h = self.proj(self.convs(self.pre(x0), x_mask, g=g)) * x_mask

	        batch, chans, frames = x0.shape
	        h = h.reshape(batch, chans, -1, frames).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]

	        n_bins = self.num_bins
	        scale = math.sqrt(self.filter_channels)
	        raw_widths = h[..., :n_bins] / scale
	        raw_heights = h[..., n_bins:2 * n_bins] / scale
	        raw_derivatives = h[..., 2 * n_bins:]

	        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
	                                                               raw_widths,
	                                                               raw_heights,
	                                                               raw_derivatives,
	                                                               inverse=reverse,
	                                                               tails='linear',
	                                                               tail_bound=self.tail_bound)

	        x = torch.cat([x0, x1], 1) * x_mask
	        if reverse:
	            return x
	        logdet = torch.sum(logabsdet * x_mask, [1, 2])
	        return x, logdet


	class ElementwiseAffine(nn.Module):
	    """
	    Learned per-channel affine flow: ``y = m + exp(logs) * x``.

	    Both the shift ``m`` and the log-scale ``logs`` have shape [channels, 1] and
	    start at zero, so the layer initially only applies the mask.
	    """

	    def __init__(self, channels):
	        super().__init__()
	        self.channels = channels
	        self.m = nn.Parameter(torch.zeros(channels, 1))
	        self.logs = nn.Parameter(torch.zeros(channels, 1))

	    def forward(self, x, x_mask, reverse=False, **kwargs):
	        """
	        Forward: returns ``(y, logdet)`` where ``logdet`` sums ``logs * x_mask`` over dims 1 and 2.
	        Reverse: returns ``(x - m) * exp(-logs) * x_mask`` only.
	        """
	        if reverse:
	            return (x - self.m) * torch.exp(-self.logs) * x_mask

	        scaled = torch.exp(self.logs) * x
	        out = (self.m + scaled) * x_mask
	        log_det = torch.sum(self.logs * x_mask, [1, 2])
	        return out, log_det


	class LayerNorm(nn.Module):
	    """
	    Layer normalisation over dim 1 (the channel dimension) with a learned
	    per-channel scale ``gamma`` and shift ``beta``.
	    """

	    def __init__(self, channels, eps=1e-5):
	        super().__init__()
	        self.channels = channels
	        self.eps = eps

	        self.gamma = nn.Parameter(torch.ones(channels))
	        self.beta = nn.Parameter(torch.zeros(channels))

	    def forward(self, x):
	        # F.layer_norm normalises the trailing dimension, so swap channels to the end and back.
	        channels_last = x.transpose(1, -1)
	        normed = F.layer_norm(channels_last, (self.channels,), self.gamma, self.beta, self.eps)
	        return normed.transpose(1, -1)


	def piecewise_rational_quadratic_transform(inputs,
	                                           unnormalized_widths,
	                                           unnormalized_heights,
	                                           unnormalized_derivatives,
	                                           inverse=False,
	                                           tails=None,
	                                           tail_bound=1.,
	                                           min_bin_width=DEFAULT_MIN_BIN_WIDTH,
	                                           min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
	                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
	    """
	    Dispatch to the matching rational-quadratic spline implementation.

	    With ``tails=None`` the spline restricted to its default domain is used; otherwise
	    the unconstrained variant is called with ``tails`` and ``tail_bound``.

	    Returns:
	        The tuple ``(outputs, logabsdet)`` produced by the selected spline function.
	    """
	    call_kwargs = {
	        'inputs': inputs,
	        'unnormalized_widths': unnormalized_widths,
	        'unnormalized_heights': unnormalized_heights,
	        'unnormalized_derivatives': unnormalized_derivatives,
	        'inverse': inverse,
	        'min_bin_width': min_bin_width,
	        'min_bin_height': min_bin_height,
	        'min_derivative': min_derivative,
	    }

	    if tails is None:
	        spline_fn = rational_quadratic_spline
	    else:
	        spline_fn = unconstrained_rational_quadratic_spline
	        call_kwargs['tails'] = tails
	        call_kwargs['tail_bound'] = tail_bound

	    transformed, log_abs_det = spline_fn(**call_kwargs)
	    return transformed, log_abs_det


	def rational_quadratic_spline(inputs,
	                              unnormalized_widths,
	                              unnormalized_heights,
	                              unnormalized_derivatives,
	                              inverse=False,
	                              left=0., right=1., bottom=0., top=1.,
	                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
	                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
	                              min_derivative=DEFAULT_MIN_DERIVATIVE):
	    """
	    Monotonic rational-quadratic spline mapping [left, right] onto [bottom, top].

	    Args:
	        inputs: values to transform; all must lie in [left, right].
	        unnormalized_widths / unnormalized_heights: raw bin sizes (last dim = number of
	            bins), normalised with a softmax.
	        unnormalized_derivatives: raw knot slopes, made positive with a softplus.
	        inverse: if True the inverse transform is evaluated and the negated
	            log-derivative is returned.
	        min_bin_width / min_bin_height / min_derivative: lower bounds added to the
	            normalised quantities.

	    Returns:
	        ``(outputs, logabsdet)``, both shaped like ``inputs``.

	    Raises:
	        ValueError: if an input lies outside the domain or a minimal bin size is too
	            large for the number of bins.
	    """
	    if torch.min(inputs) < left or torch.max(inputs) > right:
	        raise ValueError('Input to a transform is not within its domain')

	    num_bins = unnormalized_widths.shape[-1]

	    if min_bin_width * num_bins > 1.0:
	        raise ValueError('Minimal bin width too large for the number of bins')
	    if min_bin_height * num_bins > 1.0:
	        raise ValueError('Minimal bin height too large for the number of bins')

	    knots_x, bin_w = _bin_edges_and_sizes(unnormalized_widths, min_bin_width, num_bins, left, right)
	    knot_slopes = min_derivative + F.softplus(unnormalized_derivatives)
	    knots_y, bin_h = _bin_edges_and_sizes(unnormalized_heights, min_bin_height, num_bins, bottom, top)

	    # The inverse looks the bin up on the output axis, the forward pass on the input axis.
	    lookup_axis = knots_y if inverse else knots_x
	    bin_idx = searchsorted(lookup_axis, inputs)[..., None]

	    def pick(table):
	        # Select, for every input, the entry belonging to its bin.
	        return table.gather(-1, bin_idx)[..., 0]

	    x_lo = pick(knots_x)
	    w_k = pick(bin_w)
	    y_lo = pick(knots_y)
	    s_k = pick(bin_h / bin_w)  # average slope of the bin
	    d_lo = pick(knot_slopes)
	    d_hi = pick(knot_slopes[..., 1:])
	    h_k = pick(bin_h)

	    curvature = d_lo + d_hi - 2 * s_k

	    if inverse:
	        # Solve the quadratic a * t^2 + b * t + c = 0 for the position t inside the bin.
	        offset = inputs - y_lo
	        a = offset * curvature + h_k * (s_k - d_lo)
	        b = h_k * d_lo - offset * curvature
	        c = -s_k * offset

	        discriminant = b.pow(2) - 4 * a * c
	        assert (discriminant >= 0).all()

	        root = (2 * c) / (-b - torch.sqrt(discriminant))
	        outputs = root * w_k + x_lo

	        root_term = root * (1 - root)
	        denominator = s_k + curvature * root_term
	        derivative_numerator = s_k.pow(2) * (d_hi * root.pow(2)
	                                             + 2 * s_k * root_term
	                                             + d_lo * (1 - root).pow(2))
	        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
	        return outputs, -logabsdet

	    theta = (inputs - x_lo) / w_k
	    theta_term = theta * (1 - theta)

	    numerator = h_k * (s_k * theta.pow(2) + d_lo * theta_term)
	    denominator = s_k + curvature * theta_term
	    outputs = y_lo + numerator / denominator

	    derivative_numerator = s_k.pow(2) * (d_hi * theta.pow(2)
	                                         + 2 * s_k * theta_term
	                                         + d_lo * (1 - theta).pow(2))
	    logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
	    return outputs, logabsdet


	def _bin_edges_and_sizes(unnormalized, min_size, num_bins, low, high):
	    # Turn raw bin sizes into knot positions spanning [low, high] and the resulting bin sizes.
	    sizes = F.softmax(unnormalized, dim=-1)
	    sizes = min_size + (1 - min_size * num_bins) * sizes
	    edges = torch.cumsum(sizes, dim=-1)
	    edges = F.pad(edges, pad=(1, 0), mode='constant', value=0.0)
	    edges = (high - low) * edges + low
	    # Pin the outer knots exactly to the interval ends.
	    edges[..., 0] = low
	    edges[..., -1] = high
	    sizes = edges[..., 1:] - edges[..., :-1]
	    return edges, sizes


	def searchsorted(bin_locations, inputs, eps=1e-6):
	    """
	    Return, for every input, the index of the bin it falls into.

	    Args:
	        bin_locations: ascending bin edges along the last dimension.
	        inputs: values to locate; broadcast against all but the last dimension of
	            ``bin_locations``.
	        eps: amount by which the last edge is raised for the comparison, so an input
	            equal to the last edge is assigned to the final bin.

	    Returns:
	        Integer tensor with the number of edges less than or equal to each input, minus one.
	    """
	    # Work on a copy: shifting the caller's tensor in place would move the last
	    # edge a little further on every call.
	    edges = bin_locations.clone()
	    edges[..., -1] += eps
	    return torch.sum(inputs[..., None] >= edges, dim=-1) - 1


	def unconstrained_rational_quadratic_spline(inputs,
	                                            unnormalized_widths,
	                                            unnormalized_heights,
	                                            unnormalized_derivatives,
	                                            inverse=False,
	                                            tails='linear',
	                                            tail_bound=1.,
	                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
	                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
	                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
	    """
	    Rational-quadratic spline on [-tail_bound, tail_bound] with identity tails outside.

	    Inputs inside the interval are transformed by ``rational_quadratic_spline``;
	    inputs outside are returned unchanged with a log-derivative of zero.

	    Args:
	        inputs: values to transform.
	        unnormalized_widths / unnormalized_heights / unnormalized_derivatives: raw
	            spline parameters with one extra trailing dimension compared to ``inputs``.
	        inverse: evaluate the inverse transform.
	        tails: only ``'linear'`` is implemented.
	        tail_bound: half-width of the interval on which the spline is active.

	    Returns:
	        ``(outputs, logabsdet)``, both shaped like ``inputs``.

	    Raises:
	        RuntimeError: if ``tails`` is not ``'linear'``.
	    """
	    if tails != 'linear':
	        raise RuntimeError('{} tails are not implemented.'.format(tails))

	    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
	    outside_interval_mask = ~inside_interval_mask

	    outputs = torch.zeros_like(inputs)
	    logabsdet = torch.zeros_like(inputs)

	    # Add the two boundary knots. The constant is chosen so that
	    # min_derivative + softplus(constant) == 1, i.e. the spline meets the
	    # identity tails with slope 1.
	    unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
	    constant = np.log(np.exp(1 - min_derivative) - 1)
	    unnormalized_derivatives[..., 0] = constant
	    unnormalized_derivatives[..., -1] = constant

	    # Identity outside the interval; logabsdet is already zero there.
	    outputs[outside_interval_mask] = inputs[outside_interval_mask]

	    # Skip the spline when nothing falls inside the interval: the min/max domain
	    # check in rational_quadratic_spline raises on an empty selection.
	    if torch.any(inside_interval_mask):
	        outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
	            inputs=inputs[inside_interval_mask],
	            unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
	            unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
	            unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
	            inverse=inverse,
	            left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
	            min_bin_width=min_bin_width,
	            min_bin_height=min_bin_height,
	            min_derivative=min_derivative
	        )

	    return outputs, logabsdet