from typing import Iterable, Tuple
import numpy as np
import torch
from librosa.beat import beat_track
from PIL import Image
from tqdm.auto import tqdm
# from diffusers import AudioDiffusionPipeline
from .pipeline_audio_diffusion import AudioDiffusionPipeline
VERSION = "1.4.0"


class AudioDiffusion:
    def __init__(
        self,
        model_id: str = "teticio/audio-diffusion-256",
        cuda: bool = torch.cuda.is_available(),
        progress_bar: Iterable = tqdm,
    ):
        """Class for generating audio using denoising diffusion probabilistic models.

        Args:
            model_id (str): name of model (local directory or Hugging Face Hub)
            cuda (bool): if True, move the pipeline to the CUDA device
            progress_bar (iterable): iterable callback for progress updates or None
        """
        self.model_id = model_id
        self.pipe = AudioDiffusionPipeline.from_pretrained(self.model_id)
        if cuda:
            self.pipe.to("cuda")
        # Fall back to a no-op so calls to progress_bar never fail
        self.progress_bar = progress_bar or (lambda _: _)

    def generate_spectrogram_and_audio(
        self,
        steps: int = None,
        generator: torch.Generator = None,
        step_generator: torch.Generator = None,
        eta: float = 0,
        noise: torch.Tensor = None,
        encoding: torch.Tensor = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate a random mel spectrogram and convert it to audio.

        Args:
            steps (int): number of denoising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            step_generator (torch.Generator): random number generator used to denoise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            noise (torch.Tensor): noise tensor to start denoising from or None
            encoding (torch.Tensor): conditioning tensor of shape
                (batch_size, seq_length, cross_attention_dim) for a UNet2DConditionModel
        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            steps=steps,
            generator=generator,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        return images[0], (sample_rate, audios[0])
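
    # Illustrative usage sketch (not part of the original module). The seed and
    # file name below are arbitrary examples:
    #
    #   audio_diffusion = AudioDiffusion()
    #   image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(
    #       generator=torch.Generator().manual_seed(42)
    #   )
    #   image.save("generated.png")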

    def generate_spectrogram_and_audio_from_audio(
        self,
        audio_file: str = None,
        raw_audio: np.ndarray = None,
        slice: int = 0,
        start_step: int = 0,
        steps: int = None,
        generator: torch.Generator = None,
        mask_start_secs: float = 0,
        mask_end_secs: float = 0,
        step_generator: torch.Generator = None,
        eta: float = 0,
        encoding: torch.Tensor = None,
        noise: torch.Tensor = None,
    ) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
        """Generate a mel spectrogram from an audio input and convert it back to audio.

        Args:
            audio_file (str): audio file on disk (Librosa requires a real file), or
            raw_audio (np.ndarray): audio as numpy array
            slice (int): slice number of audio to convert
            start_step (int): denoising step to start from
            steps (int): number of denoising steps (defaults to 50 for DDIM, 1000 for DDPM)
            generator (torch.Generator): random number generator or None
            mask_start_secs (float): number of seconds of audio to mask (not generate) at start
            mask_end_secs (float): number of seconds of audio to mask (not generate) at end
            step_generator (torch.Generator): random number generator used to denoise or None
            eta (float): parameter between 0 and 1 used with DDIM scheduler
            encoding (torch.Tensor): conditioning tensor of shape
                (batch_size, seq_length, cross_attention_dim) for a UNet2DConditionModel
            noise (torch.Tensor): noise tensor to start denoising from or None
        Returns:
            PIL Image: mel spectrogram
            (int, np.ndarray): sample rate and raw audio
        """
        images, (sample_rate, audios) = self.pipe(
            batch_size=1,
            audio_file=audio_file,
            raw_audio=raw_audio,
            slice=slice,
            start_step=start_step,
            steps=steps,
            generator=generator,
            mask_start_secs=mask_start_secs,
            mask_end_secs=mask_end_secs,
            step_generator=step_generator,
            eta=eta,
            noise=noise,
            encoding=encoding,
            return_dict=False,
        )
        return images[0], (sample_rate, audios[0])
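
    # Illustrative audio-to-audio sketch (assumptions: "input.wav" exists on
    # disk; a nonzero start_step begins denoising from a partially noised
    # version of the input, and mask_start_secs keeps the first second of the
    # original untouched):
    #
    #   image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio_from_audio(
    #       audio_file="input.wav",
    #       start_step=500,
    #       mask_start_secs=1,
    #   )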

    @staticmethod
    def loop_it(audio: np.ndarray, sample_rate: int, loops: int = 12) -> np.ndarray:
        """Loop audio over a whole number of bars.

        Args:
            audio (np.ndarray): audio as numpy array
            sample_rate (int): sample rate of audio
            loops (int): number of times to loop

        Returns:
            (np.ndarray): looped audio, or None if no whole bar was detected
        """
        _, beats = beat_track(y=audio, sr=sample_rate, units="samples")
        # Truncate the detected beats to a whole number of 4-beat bars
        beats_in_bar = (len(beats) - 1) // 4 * 4
        if beats_in_bar > 0:
            return np.tile(audio[beats[0] : beats[beats_in_bar]], loops)
        return None
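

if __name__ == "__main__":
    # Hedged end-to-end sketch, not part of the original module: it assumes the
    # default checkpoint can be downloaded from the Hugging Face Hub and that
    # scipy is installed for writing WAV files. File names are arbitrary.
    from scipy.io import wavfile

    audio_diffusion = AudioDiffusion()
    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
    image.save("spectrogram.png")
    wavfile.write("sample.wav", sample_rate, audio)
    # Tile a whole number of bars into a longer, seamless loop if possible.
    loop = AudioDiffusion.loop_it(audio, sample_rate)
    if loop is not None:
        wavfile.write("loop.wav", sample_rate, loop)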