Spaces:
Paused
Paused
# --------------------------------------------------------
# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
# Github source: https://github.com/microsoft/unilm/tree/master/beats
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Based on fairseq code bases
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math | |
import warnings | |
import torch | |
from torch import Tensor, nn | |
import torch.nn.functional as F | |
class GradMultiply(torch.autograd.Function):
    """Pass values through unchanged while rescaling the gradient.

    Use as ``GradMultiply.apply(x, scale)``: the forward result carries the
    same values as ``x``; during backpropagation the gradient flowing back
    to ``x`` is multiplied by ``scale``.
    """

    # torch.autograd.Function requires forward/backward to be static methods;
    # they are invoked through ``GradMultiply.apply`` with ``ctx`` supplied
    # by autograd rather than on an instance.
    @staticmethod
    def forward(ctx, x, scale):
        """Stash ``scale`` on ``ctx`` and return a tensor built from ``x``.

        Args:
            ctx: autograd context object used to carry state to ``backward``.
            x: input tensor.
            scale: factor applied to the gradient in ``backward``.
        """
        ctx.scale = scale
        res = x.new(x)
        return res

    @staticmethod
    def backward(ctx, grad):
        """Return the scaled gradient for ``x`` and ``None`` for ``scale``."""
        return grad * ctx.scale, None
class SamePad(nn.Module):
    """Drop trailing positions along dimension 2 of the input.

    The number of positions removed is fixed at construction time:
    ``kernel_size - 1`` when ``causal`` is true, otherwise 1 for an even
    ``kernel_size`` and 0 for an odd one.
    """

    def __init__(self, kernel_size, causal=False):
        super().__init__()
        if causal:
            trailing = kernel_size - 1
        elif kernel_size % 2 == 0:
            trailing = 1
        else:
            trailing = 0
        # Number of trailing positions cut off in forward().
        self.remove = trailing

    def forward(self, x):
        """Return ``x`` with the last ``self.remove`` entries of dim 2 removed."""
        if self.remove <= 0:
            # Nothing to trim: hand back the very same tensor.
            return x
        return x[:, :, : -self.remove]
class Swish(nn.Module):
    """Swish activation: multiplies the input by its own sigmoid."""

    def __init__(self):
        super().__init__()
        # Sigmoid sub-module producing the gate applied in forward().
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Return ``x * sigmoid(x)`` element-wise."""
        gate = self.act(x)
        return x * gate
class GLU_Linear(nn.Module):
    """Linear projection followed by a gated linear unit.

    The input is projected to ``2 * output_dim`` features. The first half is
    the value and the second half is the gate; the output is
    ``value * act(gate)``, or ``value * gate`` for ``glu_type="bilinear"``.

    Args:
        input_dim: size of the last dimension of the input.
        output_dim: size of the last dimension of the output.
        glu_type: one of ``"sigmoid"``, ``"swish"``, ``"relu"``, ``"gelu"``
            or ``"bilinear"``.
        bias_in_glu: whether the linear projection has a bias term.

    Raises:
        ValueError: if ``glu_type`` is not one of the supported values.
    """

    def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
        super().__init__()
        self.glu_type = glu_type
        self.output_dim = output_dim

        if glu_type == "sigmoid":
            self.glu_act = torch.nn.Sigmoid()
        elif glu_type == "swish":
            self.glu_act = Swish()
        elif glu_type == "relu":
            self.glu_act = torch.nn.ReLU()
        elif glu_type == "gelu":
            self.glu_act = torch.nn.GELU()
        elif glu_type != "bilinear":
            # Fail at construction instead of with an AttributeError on the
            # missing ``glu_act`` the first time forward() runs.
            raise ValueError("unsupported glu_type: {!r}".format(glu_type))

        # One projection yields both halves (value and gate).
        self.linear = nn.Linear(input_dim, output_dim * 2, bool(bias_in_glu))

    def forward(self, x):
        """Apply the projection and gate; features are on the last dimension.

        Works for inputs of any rank because the split uses ``...`` indexing;
        for 3-D inputs this is the same as slicing ``x[:, :, ...]``.
        """
        # to be consistent with GLU_Linear, we assume the input always has the #channel (#dim) in the last dimension of the tensor, so need to switch the dimension first for 1D-Conv case
        x = self.linear(x)
        value = x[..., : self.output_dim]
        gate = x[..., self.output_dim : self.output_dim * 2]

        if self.glu_type == "bilinear":
            return value * gate
        return value * self.glu_act(gate)
def gelu_accurate(x):
    """Tanh-based GELU: ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``.

    The constant ``sqrt(2/pi)`` is computed on the first call and cached on
    the function object as ``gelu_accurate._a``.
    """
    try:
        coeff = gelu_accurate._a
    except AttributeError:
        coeff = gelu_accurate._a = math.sqrt(2 / math.pi)

    inner = coeff * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))
def gelu(x: torch.Tensor) -> torch.Tensor:
    """Apply GELU in float32 and cast the result back to the type of ``x``."""
    as_fp32 = x.float()
    activated = F.gelu(as_fp32)
    return activated.type_as(x)
def get_activation_fn(activation: str):
    """Look up the activation callable registered under the name `activation`.

    Supported names: ``relu``, ``gelu``, ``gelu_fast`` (alias of
    ``gelu_accurate``; emits a warning), ``gelu_accurate``, ``tanh``,
    ``linear`` and ``glu`` (both of the last two map to the identity).

    Raises:
        RuntimeError: if `activation` is none of the names above.
    """
    # "linear" and "glu" both resolve to a pass-through function.
    if activation in ("linear", "glu"):
        return lambda x: x

    if activation == "relu":
        return F.relu

    if activation == "tanh":
        return torch.tanh

    if activation == "gelu":
        return gelu

    if activation in ("gelu_fast", "gelu_accurate"):
        if activation == "gelu_fast":
            # Old name kept for compatibility; tell the caller about the rename.
            warnings.warn(
                "--activation-fn=gelu_fast has been renamed to gelu_accurate"
            )
        return gelu_accurate

    raise RuntimeError("--activation-fn {} not supported".format(activation))
def quant_noise(module, p, block_size):
    """
    Wraps modules and applies quantization noise to the weights for
    subsequent quantization with Iterative Product Quantization as
    described in "Training with Quantization Noise for Extreme Model Compression"

    Args:
        - module: nn.Module
        - p: amount of Quantization Noise
        - block_size: size of the blocks for subsequent quantization with iPQ

    Returns:
        - the same `module` object; when p > 0 a forward pre-hook has been
          registered on it, otherwise it is returned untouched

    Remarks:
        - Module weights must have the right sizes wrt the block size
        - Only Linear, Embedding and Conv2d modules are supported for the moment
        - For more detail on how to quantize by blocks with convolutional weights,
          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
        - We implement the simplest form of noise here as stated in the paper
          which consists in randomly dropping blocks
        - The hook only acts while the module is in training mode, and it
          overwrites `module.weight.data` with the masked, rescaled weights
    """

    # if no quantization noise, don't register hook
    if p <= 0:
        return module

    # supported modules
    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))

    # test whether module.weight has the right sizes wrt block_size
    is_conv = module.weight.ndim == 4

    # 2D matrix
    if not is_conv:
        assert (
            module.weight.size(1) % block_size == 0
        ), "Input features must be a multiple of block sizes"

    # 4D matrix
    else:
        # 1x1 convolutions
        if module.kernel_size == (1, 1):
            assert (
                module.in_channels % block_size == 0
            ), "Input channels must be a multiple of block sizes"
        # regular convolutions
        else:
            k = module.kernel_size[0] * module.kernel_size[1]
            assert k % block_size == 0, "Kernel size must be a multiple of block size"

    def _forward_pre_hook(mod, inputs):
        # no noise for evaluation
        if not mod.training:
            return

        # gather weight and sizes
        weight = mod.weight

        if not is_conv:
            in_features = weight.size(1)
            out_features = weight.size(0)

            # split weight matrix into blocks and randomly drop selected blocks
            mask = torch.zeros(
                in_features // block_size * out_features, device=weight.device
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)

        else:
            in_channels = mod.in_channels
            out_channels = mod.out_channels

            # split weight matrix into blocks and randomly drop selected blocks
            if mod.kernel_size == (1, 1):
                mask = torch.zeros(
                    int(in_channels // block_size * out_channels),
                    device=weight.device,
                )
                mask.bernoulli_(p)
                # Shape the mask as (out, in, 1, 1) so it lines up with the
                # 4-D conv weight. A 2-D (out, in) mask would be right-aligned
                # against the weight and broadcast it to (out, in, out, in).
                # NOTE(review): sized from in_channels, so this presumes
                # weight.size(1) == in_channels (groups == 1) -- confirm.
                mask = mask.repeat_interleave(block_size, -1).view(
                    -1, in_channels, 1, 1
                )
            else:
                # one Bernoulli draw per (out, in) pair, shared by the whole kernel
                mask = torch.zeros(
                    weight.size(0), weight.size(1), device=weight.device
                )
                mask.bernoulli_(p)
                mask = (
                    mask.unsqueeze(2)
                    .unsqueeze(3)
                    .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
                )

        # scale weights and apply mask
        mask = mask.to(
            torch.bool
        )  # x.bool() is not currently supported in TorchScript
        # rescale the kept entries by 1 / (1 - p)
        s = 1 / (1 - p)
        mod.weight.data = s * weight.masked_fill(mask, 0)

    module.register_forward_pre_hook(_forward_pre_hook)
    return module