# Copied from a Hugging Face Space file view (Space running on a T4 GPU).
# File size: 1,436 bytes; commit 88490a8.
import torch
import torch.nn as nn
import torchaudio
class LogMelSpectrogram(nn.Module):
    """Convert raw audio into a log-scaled mel spectrogram.

    The transform is a fixed (non-trainable) front end: it is evaluated
    without gradient tracking and with CUDA autocast turned off.

    Args:
        sample_rate: Sample rate of the incoming audio, in Hz.
        n_fft: FFT size handed to the mel-spectrogram transform.
        hop_length: Hop between successive analysis frames, in samples.
        f_min: Lowest frequency covered by the mel filter bank, in Hz.
        n_mels: Number of mel bins in the output.

    The defaults reproduce the values that used to be hard-coded, so
    ``LogMelSpectrogram()`` behaves exactly as before.
    """

    def __init__(
        self,
        sample_rate: int = 22050,
        n_fft: int = 4096,
        hop_length: int = 1024,
        f_min: float = 10.0,
        n_mels: int = 512,
    ) -> None:
        super().__init__()
        # The attribute name ``melspectrogram`` is kept on purpose: renaming
        # it would rename any state_dict entries that live underneath it.
        self.melspectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            f_min=f_min,
            n_mels=n_mels,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the log-mel spectrogram of ``x``.

        Args:
            x: Audio, shaped ``(batch, sample)``.

        Returns:
            Log-mel spectrogram, shaped ``(batch, freq, frame)``.
        """
        # No gradients are needed through the front end, and autocast is
        # switched off so the spectrogram is not computed under mixed
        # precision.
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                X = self.melspectrogram(x)
                # Floor the values before the log so that empty bins map to
                # log(1e-6) instead of -inf.
                X = X.clamp(min=1e-6).log()
        return X
class ConcatEmbeddingToMel(nn.Module):
    """Prepend one learned embedding vector to a feature sequence.

    An integer id per batch item is shifted by ``embedding_offset``, looked
    up in an embedding table with ``n_vocab`` rows of width ``n_dim``, and
    placed in front of the feature sequence along the time axis.
    """

    def __init__(self, embedding_offset, n_vocab, n_dim) -> None:
        super().__init__()
        self.embedding = nn.Embedding(n_vocab, n_dim)
        self.embedding_offset = embedding_offset

    def forward(self, feature, index_value):
        """Return ``feature`` with the looked-up embedding prepended.

        index_value : (batch, )
        feature     : (batch, time, feature_dim)
        returns     : (batch, 1 + time, feature_dim)
        """
        # Shift the incoming ids so they address rows of the embedding table.
        table_rows = index_value - self.embedding_offset
        looked_up = self.embedding(table_rows)
        # Insert a length-1 time axis: (batch, 1, feature_dim).
        prefix = torch.unsqueeze(looked_up, 1)
        return torch.cat((prefix, feature), dim=1)
|