# --- Hugging Face file-viewer page residue (kept as a comment so the module parses) ---
# Flux9665's picture
# use explicit code instead of relying on release download
# 9e275b8
# raw | history | blame | 6.14 kB
"""
Code is from https://github.com/sony/bigvsan/blob/main/san_modules.py
Paper: Shibuya, T., Takida, Y., Mitsufuji, Y., "BigVSAN: Enhancing GAN-based Neural Vocoders with Slicing Adversarial Network," Preprint.
https://arxiv.org/pdf/2309.02836.pdf
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
def _normalize(tensor, dim):
denom = tensor.norm(p=2.0, dim=dim, keepdim=True).clamp_min(1e-12)
return tensor / denom
class SANConv1d(nn.Conv1d):
    """1-D convolution for SAN discriminators (BigVSAN, Shibuya et al.).

    The kernel is stored as a unit-norm direction (L2 over the in-channel and
    kernel dims) with a separate learnable per-output-channel ``scale``.  With
    ``flg_train=True``, :meth:`forward` returns the pair ``[h_fun, h_dir]``
    used by the slicing adversarial objective, in which the scale and the
    weight direction receive gradients from different paths.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=True,
                 padding_mode='zeros',
                 device=None,
                 dtype=None
                 ):
        super(SANConv1d, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding=padding,
            dilation=dilation, groups=1, bias=bias, padding_mode=padding_mode,
            device=device, dtype=dtype)
        # Split the randomly-initialized kernel into direction * magnitude.
        norms = self.weight.norm(p=2.0, dim=[1, 2], keepdim=True).clamp_min(1e-12)
        self.weight = nn.parameter.Parameter(self.weight / norms.expand_as(self.weight))
        self.scale = nn.parameter.Parameter(norms.view(out_channels))
        # NOTE: unlike nn.Conv1d, this bias has in_channels entries and is added
        # to the *input* in forward() — this mirrors the upstream reference code.
        if bias:
            self.bias = nn.parameter.Parameter(torch.zeros(in_channels, device=device, dtype=dtype))
        else:
            self.register_parameter('bias', None)
        self.normalize_weight()

    def forward(self, input, flg_train=False):
        """Apply the convolution.

        Returns a single tensor, or — when ``flg_train`` is True — the list
        ``[out_fun, out_dir]`` with complementary detach patterns.
        """
        if self.bias is not None:
            input = input + self.bias.view(self.in_channels, 1)
        w_dir = self._get_normalized_weight()
        scale = self.scale.view(self.out_channels, 1)
        if not flg_train:
            return scale * F.conv1d(input, w_dir, None, self.stride,
                                    self.padding, self.dilation, self.groups)
        # 'fun' trains the scale on the live input with a frozen direction;
        # 'dir' trains the direction on a detached input with a frozen scale.
        out_fun = F.conv1d(input, w_dir.detach(), None, self.stride,
                           self.padding, self.dilation, self.groups)
        out_dir = F.conv1d(input.detach(), w_dir, None, self.stride,
                           self.padding, self.dilation, self.groups)
        return [out_fun * scale, out_dir * scale.detach()]

    @torch.no_grad()
    def normalize_weight(self):
        """Renormalize the stored weight in place to unit per-channel norm."""
        self.weight.data = self._get_normalized_weight()

    def _get_normalized_weight(self):
        # Unit L2 norm over the (in_channels, kernel) dims, eps-clamped.
        denom = self.weight.norm(p=2.0, dim=[1, 2], keepdim=True).clamp_min(1e-12)
        return self.weight / denom
class SANConv2d(nn.Conv2d):
    """2-D convolution for SAN discriminators (BigVSAN, Shibuya et al.).

    Same scheme as :class:`SANConv1d`: the kernel is stored as a unit-norm
    direction (L2 over in-channel and both kernel dims) times a learnable
    per-output-channel ``scale``; ``flg_train=True`` yields the gradient-split
    pair ``[h_fun, h_dir]`` for the slicing adversarial objective.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=True,
                 padding_mode='zeros',
                 device=None,
                 dtype=None
                 ):
        super(SANConv2d, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding=padding,
            dilation=dilation, groups=1, bias=bias, padding_mode=padding_mode,
            device=device, dtype=dtype)
        # Split the randomly-initialized kernel into direction * magnitude.
        norms = self.weight.norm(p=2.0, dim=[1, 2, 3], keepdim=True).clamp_min(1e-12)
        self.weight = nn.parameter.Parameter(self.weight / norms.expand_as(self.weight))
        self.scale = nn.parameter.Parameter(norms.view(out_channels))
        # NOTE: unlike nn.Conv2d, this bias has in_channels entries and is added
        # to the *input* in forward() — this mirrors the upstream reference code.
        if bias:
            self.bias = nn.parameter.Parameter(torch.zeros(in_channels, device=device, dtype=dtype))
        else:
            self.register_parameter('bias', None)
        self.normalize_weight()

    def forward(self, input, flg_train=False):
        """Apply the convolution.

        Returns a single tensor, or — when ``flg_train`` is True — the list
        ``[out_fun, out_dir]`` with complementary detach patterns.
        """
        if self.bias is not None:
            input = input + self.bias.view(self.in_channels, 1, 1)
        w_dir = self._get_normalized_weight()
        scale = self.scale.view(self.out_channels, 1, 1)
        if not flg_train:
            return scale * F.conv2d(input, w_dir, None, self.stride,
                                    self.padding, self.dilation, self.groups)
        # 'fun' trains the scale on the live input with a frozen direction;
        # 'dir' trains the direction on a detached input with a frozen scale.
        out_fun = F.conv2d(input, w_dir.detach(), None, self.stride,
                           self.padding, self.dilation, self.groups)
        out_dir = F.conv2d(input.detach(), w_dir, None, self.stride,
                           self.padding, self.dilation, self.groups)
        return [out_fun * scale, out_dir * scale.detach()]

    @torch.no_grad()
    def normalize_weight(self):
        """Renormalize the stored weight in place to unit per-channel norm."""
        self.weight.data = self._get_normalized_weight()

    def _get_normalized_weight(self):
        # Unit L2 norm over the (in_channels, kH, kW) dims, eps-clamped.
        denom = self.weight.norm(p=2.0, dim=[1, 2, 3], keepdim=True).clamp_min(1e-12)
        return self.weight / denom
class SANEmbedding(nn.Embedding):
def __init__(self, num_embeddings, embedding_dim,
scale_grad_by_freq=False,
sparse=False, _weight=None,
device=None, dtype=None):
super(SANEmbedding, self).__init__(
num_embeddings, embedding_dim, padding_idx=None,
max_norm=None, norm_type=2., scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse, _weight=_weight,
device=device, dtype=dtype)
scale = self.weight.norm(p=2.0, dim=1, keepdim=True).clamp_min(1e-12)
self.weight = nn.parameter.Parameter(self.weight / scale.expand_as(self.weight))
self.scale = nn.parameter.Parameter(scale)
def forward(self, input, flg_train=False):
out = F.embedding(
input, self.weight, self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq, self.sparse)
out = _normalize(out, dim=-1)
scale = F.embedding(
input, self.scale, self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq, self.sparse)
if flg_train:
out_fun = out.detach()
out_dir = out
out = [out_fun * scale, out_dir * scale.detach()]
else:
out = out * scale
return out
@torch.no_grad()
def normalize_weight(self):
self.weight.data = _normalize(self.weight, dim=1)