teticio committed on
Commit 2561128
1 Parent(s): 21c77d0

various fixes

README.md CHANGED
@@ -18,7 +18,7 @@ license: gpl-3.0
 **UPDATES**:
 
 15/10/2022
-Added latent audio diffusion (see below).
+Added latent audio diffusion (see below). Also added the option to train a model with DDIM ([Denoising Diffusion Implicit Models](https://arxiv.org/pdf/2010.02502.pdf)) by setting `--scheduler ddim`. DDIM models have the benefit that samples can be generated in far fewer steps (~50) than were used in training.
 
 4/10/2022
 It is now possible to mask parts of the input audio during generation which means you can stitch several samples together (think "out-painting").
@@ -138,5 +138,6 @@ python scripts/train_vae.py \
 #### Train latent diffusion model.
 ```bash
 accelerate launch ...
---vae models/autoencoder-kl
+--vae models/autoencoder-kl \
+--latent_resolution 32
 ```
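For reference, a minimal inference sketch against this commit's `AudioDiffusion` helper, illustrating the reduced step count that DDIM-trained models allow. The model id is illustrative and `steps=50` is only a sensible value suggested by the README note above, not something fixed by this commit:

```python
from audiodiffusion import AudioDiffusion

# Illustrative model id -- substitute a model trained with --scheduler ddim.
audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-ddim-256")

# ~50 de-noising steps are typically enough for DDIM models (vs. 1000 for DDPM).
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(steps=50)
image.save("sample.png")  # mel spectrogram as a PIL image; `audio` is a raw numpy array
```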
audiodiffusion/__init__.py CHANGED
@@ -1,15 +1,16 @@
-from typing import Iterable, Tuple
+from typing import Iterable, Tuple, Union, List
 
 import torch
 import numpy as np
 from PIL import Image
 from tqdm.auto import tqdm
 from librosa.beat import beat_track
-from diffusers import DiffusionPipeline
+from diffusers import (DiffusionPipeline, DDPMPipeline, UNet2DConditionModel,
+                       DDIMScheduler, DDPMScheduler, AutoencoderKL)
 
 from .mel import Mel
 
-VERSION = "1.1.5"
+VERSION = "1.2.0"
 
 
 class AudioDiffusion:
@@ -42,7 +43,11 @@ class AudioDiffusion:
                        hop_length=hop_length,
                        top_db=top_db)
         self.model_id = model_id
-        self.pipe = DiffusionPipeline.from_pretrained(self.model_id)
+        try:  # a bit hacky
+            self.pipe = LatentAudioDiffusionPipeline.from_pretrained(self.model_id)
+        except:
+            self.pipe = AudioDiffusionPipeline.from_pretrained(self.model_id)
+
         if cuda:
             self.pipe.to("cuda")
         self.progress_bar = progress_bar or (lambda _: _)
@@ -55,20 +60,18 @@ class AudioDiffusion:
         """Generate random mel spectrogram and convert to audio.
 
         Args:
+            steps (int): number of de-noising steps to perform (defaults to num_train_timesteps)
            generator (torch.Generator): random number generator or None
 
        Returns:
            PIL Image: mel spectrogram
            (float, np.ndarray): sample rate and raw audio
        """
-        images = self.pipe(output_type="numpy",
-                           generator=generator,
-                           num_inference_steps=self.pipe.scheduler.
-                           num_train_timesteps)["sample"]
-        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
-        image = Image.fromarray(images[0][0])
-        audio = self.mel.image_to_audio(image)
-        return image, (self.mel.get_sample_rate(), audio)
+        images, (sample_rate, audios) = self.pipe(mel=self.mel,
+                                                  batch_size=1,
+                                                  steps=steps,
+                                                  generator=generator)
+        return images[0], (sample_rate, audios[0])
 
     @torch.no_grad()
     def generate_spectrogram_and_audio_from_audio(
@@ -99,51 +102,124 @@ class AudioDiffusion:
            (float, np.ndarray): sample rate and raw audio
        """
 
-        # It would be better to derive a class from DiffusionPipeline
-        # but currently the return type ImagePipelineOutput cannot be imported
+        images, (sample_rate,
+                 audios) = self.pipe(mel=self.mel,
+                                     batch_size=1,
+                                     audio_file=audio_file,
+                                     raw_audio=raw_audio,
+                                     slice=slice,
+                                     start_step=start_step,
+                                     steps=steps,
+                                     generator=generator,
+                                     mask_start_secs=mask_start_secs,
+                                     mask_end_secs=mask_end_secs)
+        return images[0], (sample_rate, audios[0])
+
+    @staticmethod
+    def loop_it(audio: np.ndarray,
+                sample_rate: int,
+                loops: int = 12) -> np.ndarray:
+        """Loop audio
+
+        Args:
+            audio (np.ndarray): audio as numpy array
+            sample_rate (int): sample rate of audio
+            loops (int): number of times to loop
+
+        Returns:
+            (float, np.ndarray): sample rate and raw audio or None
+        """
+        _, beats = beat_track(y=audio, sr=sample_rate, units='samples')
+        for beats_in_bar in [16, 12, 8, 4]:
+            if len(beats) > beats_in_bar:
+                return np.tile(audio[beats[0]:beats[beats_in_bar]], loops)
+        return None
+
+
+class AudioDiffusionPipeline(DiffusionPipeline):
+
+    def __init__(self, unet: UNet2DConditionModel,
+                 scheduler: Union[DDIMScheduler, DDPMScheduler]):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        mel: Mel,
+        batch_size: int = 1,
+        audio_file: str = None,
+        raw_audio: np.ndarray = None,
+        slice: int = 0,
+        start_step: int = 0,
+        steps: int = None,
+        generator: torch.Generator = None,
+        mask_start_secs: float = 0,
+        mask_end_secs: float = 0
+    ) -> Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]]:
+        """Generate random mel spectrogram from audio input and convert to audio.
+
+        Args:
+            mel (Mel): instance of Mel class to perform image <-> audio
+            batch_size (int): number of samples to generate
+            audio_file (str): must be a file on disk due to Librosa limitation or
+            raw_audio (np.ndarray): audio as numpy array
+            slice (int): slice number of audio to convert
+            start_step (int): step to start from
+            steps (int): number of de-noising steps to perform (defaults to num_train_timesteps)
+            generator (torch.Generator): random number generator or None
+            mask_start_secs (float): number of seconds of audio to mask (not generate) at start
+            mask_end_secs (float): number of seconds of audio to mask (not generate) at end
+
+        Returns:
+            List[PIL Image]: mel spectrograms
+            (float, List[np.ndarray]): sample rate and raw audios
+        """
+
         if steps is None:
-            steps = self.pipe.scheduler.num_train_timesteps
+            steps = self.scheduler.num_train_timesteps
         # Unfortunately, the schedule is set up in the constructor
-        scheduler = self.pipe.scheduler.__class__(num_train_timesteps=steps)
+        scheduler = self.scheduler.__class__(num_train_timesteps=steps)
         scheduler.set_timesteps(steps)
         mask = None
         images = noise = torch.randn(
-            (1, self.pipe.unet.in_channels, self.pipe.unet.sample_size,
-             self.pipe.unet.sample_size),
+            (batch_size, self.unet.in_channels, self.unet.sample_size,
+             self.unet.sample_size),
            generator=generator)
 
         if audio_file is not None or raw_audio is not None:
-            self.mel.load_audio(audio_file, raw_audio)
-            input_image = self.mel.audio_slice_to_image(slice)
+            mel.load_audio(audio_file, raw_audio)
+            input_image = mel.audio_slice_to_image(slice)
            input_image = np.frombuffer(input_image.tobytes(),
                                        dtype="uint8").reshape(
                                            (input_image.height,
                                             input_image.width))
            input_image = ((input_image / 255) * 2 - 1)
+            input_images = np.tile(input_image, (batch_size, 1, 1, 1))
 
-            if hasattr(self.pipe, 'vqvae'):
-                input_image = self.pipe.vqvae.encode(
-                    input_image).latent_dist.sample(generator=generator)
-                input_image = 0.18215 * input_image
+            if hasattr(self, 'vqvae'):
+                input_images = self.vqvae.encode(
+                    input_images).latent_dist.sample(generator=generator)
+                input_images = 0.18215 * input_images
 
             if start_step > 0:
                 images[0, 0] = scheduler.add_noise(
-                    torch.tensor(input_image[np.newaxis, np.newaxis, :]),
+                    torch.tensor(input_images[:, np.newaxis, np.newaxis, :]),
                    noise, torch.tensor(steps - start_step))
 
-            pixels_per_second = (self.mel.get_sample_rate() *
-                                 self.pipe.unet.sample_size /
-                                 self.mel.hop_length / self.mel.x_res)
+            pixels_per_second = (mel.get_sample_rate() *
+                                 self.unet.sample_size / mel.hop_length /
+                                 mel.x_res)
            mask_start = int(mask_start_secs * pixels_per_second)
            mask_end = int(mask_end_secs * pixels_per_second)
            mask = scheduler.add_noise(
-                torch.tensor(input_image[np.newaxis, np.newaxis, :]), noise,
+                torch.tensor(input_images[:, np.newaxis, :]), noise,
                torch.tensor(scheduler.timesteps[start_step:]))
 
-        images = images.to(self.pipe.device)
+        images = images.to(self.device)
         for step, t in enumerate(
                self.progress_bar(scheduler.timesteps[start_step:])):
-            model_output = self.pipe.unet(images, t)['sample']
+            model_output = self.unet(images, t)['sample']
            images = scheduler.step(model_output,
                                    t,
                                    images,
@@ -151,45 +227,36 @@ class AudioDiffusion:
 
         if mask is not None:
            if mask_start > 0:
-                images[0, 0, :, :mask_start] = mask[step,
-                                                    0, :, :mask_start]
+                images[:, :, :, :mask_start] = mask[
+                    step, :, :, :, :mask_start]
            if mask_end > 0:
-                images[0, 0, :, -mask_end:] = mask[step, 0, :, -mask_end:]
+                images[:, :, :, -mask_end:] = mask[step, :, :, :,
+                                                   -mask_end:]
 
-        if hasattr(self.pipe, 'vqvae'):
+        if hasattr(self, 'vqvae'):
            # 0.18215 was scaling factor used in training to ensure unit variance
-            # This is also currently hardcoded in diffusers pipeline
            images = 1 / 0.18215 * images
-            images = self.pipe.vqvae.decode(images)['sample']
+            images = self.vqvae.decode(images)['sample']
 
         images = (images / 2 + 0.5).clamp(0, 1)
         images = images.cpu().permute(0, 2, 3, 1).numpy()
-        images = (images * 255).round().astype("uint8").transpose(0, 3, 1, 2)
-        image = Image.fromarray(images[0][0])
-
-        if hasattr(self.pipe,
-                   'vqvae') and self.pipe.vqvae.config['out_channels'] == 3:
-            image = image.convert('L')
-
-        audio = self.mel.image_to_audio(image)
-        return image, (self.mel.get_sample_rate(), audio)
-
-    @staticmethod
-    def loop_it(audio: np.ndarray,
-                sample_rate: int,
-                loops: int = 12) -> np.ndarray:
-        """Loop audio
-
-        Args:
-            audio (np.ndarray): audio as numpy array
-            sample_rate (int): sample rate of audio
-            loops (int): number of times to loop
-
-        Returns:
-            (float, np.ndarray): sample rate and raw audio or None
-        """
-        _, beats = beat_track(y=audio, sr=sample_rate, units='samples')
-        for beats_in_bar in [16, 12, 8, 4]:
-            if len(beats) > beats_in_bar:
-                return np.tile(audio[beats[0]:beats[beats_in_bar]], loops)
-        return None
+        images = (images * 255).round().astype("uint8")
+        images = list(
+            map(lambda _: Image.fromarray(_[:, :, 0]), images) if images.
+            shape[3] == 1 else map(
+                lambda _: Image.fromarray(_, mode='RGB').convert('L'), images))
+
+        audios = list(map(lambda _: mel.image_to_audio(_), images))
+        return images, (mel.get_sample_rate(), audios)
+
+
+class LatentAudioDiffusionPipeline(AudioDiffusionPipeline):
+
+    def __init__(self, unet: UNet2DConditionModel,
+                 scheduler: Union[DDIMScheduler,
+                                  DDPMScheduler], vqvae: AutoencoderKL):
+        super().__init__(unet=unet, scheduler=scheduler)
+        self.register_modules(vqvae=vqvae)
+
+    def __call__(self, *args, **kwargs):
+        return super().__call__(*args, **kwargs)
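A usage sketch of the new pipeline API defined above; the `Mel` defaults and the model id are assumptions for illustration, not part of this diff:

```python
import torch

from audiodiffusion import AudioDiffusionPipeline
from audiodiffusion.mel import Mel

mel = Mel()  # image <-> audio converter (assuming its default settings)
# Illustrative model id; LatentAudioDiffusionPipeline would be loaded for models with a VAE.
pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-256")
generator = torch.Generator().manual_seed(42)

# One call now returns a batch of spectrogram images and the corresponding audios.
images, (sample_rate, audios) = pipe(mel=mel,
                                     batch_size=2,
                                     steps=50,
                                     generator=generator)
images[0].save("sample.png")
```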
 
 
 
 
 
 
config/ldm_autoencoder_kl.yaml CHANGED
@@ -14,12 +14,12 @@ model:
 
     ddconfig:
       double_z: True
-      z_channels: 3
+      z_channels: 4
       resolution: 256
       in_channels: 3
       out_ch: 3
       ch: 128
-      ch_mult: [ 1,2,4 ]  # num_down = len(ch_mult)-1
+      ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
       num_res_blocks: 2
       attn_resolutions: [ ]
       dropout: 0.0
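The extra `ch_mult` level presumably takes the 256×256 spectrogram down to a 32×32 latent, which would match the `--latent_resolution 32` now shown in the README. A quick sanity check, assuming each down block halves the spatial size (as in the CompVis autoencoder):

```python
resolution = 256             # ddconfig resolution above
ch_mult = [1, 2, 4, 4]       # new value in this commit
num_down = len(ch_mult) - 1  # per the comment in the config
latent_resolution = resolution // 2 ** num_down
print(latent_resolution)     # 32 -> --latent_resolution 32
```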
scripts/train_unconditional.py CHANGED
@@ -5,12 +5,11 @@ import os
 
 import torch
 import torch.nn.functional as F
-from PIL import Image
 
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_from_disk, load_dataset
-from diffusers import (DDPMPipeline, DDPMScheduler, UNet2DModel, LDMPipeline,
+from diffusers import (DiffusionPipeline, DDPMScheduler, UNet2DModel,
                        DDIMScheduler, AutoencoderKL)
 from diffusers.hub_utils import init_git_repo, push_to_hub
 from diffusers.optimization import get_scheduler
@@ -23,10 +22,12 @@ from torchvision.transforms import (
     Resize,
     ToTensor,
 )
+import numpy as np
 from tqdm.auto import tqdm
 from librosa.util import normalize
 
 from audiodiffusion.mel import Mel
+from audiodiffusion import LatentAudioDiffusionPipeline, AudioDiffusionPipeline
 
 
 logger = get_logger(__name__)
@@ -45,7 +46,7 @@ def main(args):
         vqvae = AutoencoderKL.from_pretrained(args.vae)
 
     if args.from_pretrained is not None:
-        model = DDPMPipeline.from_pretrained(args.from_pretrained).unet
+        model = DiffusionPipeline.from_pretrained(args.from_pretrained).unet
     else:
         model = UNet2DModel(
            sample_size=args.resolution
@@ -237,12 +238,14 @@ def main(args):
         if accelerator.is_main_process:
            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                if args.vae is not None:
-                    pipeline = LDMPipeline(unet=accelerator.unwrap_model(
-                        ema_model.averaged_model if args.use_ema else model),
-                                           vqvae=vqvae,
-                                           scheduler=noise_scheduler)
+                    pipeline = LatentAudioDiffusionPipeline(
+                        unet=accelerator.unwrap_model(
+                            ema_model.averaged_model if args.use_ema else model
+                        ),
+                        vqvae=vqvae,
+                        scheduler=noise_scheduler)
                else:
-                    pipeline = DDPMPipeline(
+                    pipeline = AudioDiffusionPipeline(
                        unet=accelerator.unwrap_model(
                            ema_model.averaged_model if args.use_ema else model
                        ),
@@ -267,33 +270,27 @@ def main(args):
            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
                generator = torch.manual_seed(42)
                # run pipeline in inference (sample random noise and denoise)
-                with torch.no_grad():
-                    images = pipeline(
-                        generator=generator,
-                        batch_size=args.eval_batch_size,
-                        output_type="numpy",
-                        num_inference_steps=args.num_train_steps,
-                    )["sample"]
+                images, (sample_rate, audios) = pipeline(
+                    mel=mel,
+                    generator=generator,
+                    batch_size=args.eval_batch_size,
+                    steps=args.num_train_steps,
+                )
 
                # denormalize the images and save to tensorboard
-                images_processed = ((images *
-                                     255).round().astype("uint8").transpose(
-                                         0, 3, 1, 2))
+                images = np.array([
+                    np.frombuffer(image.tobytes(), dtype="uint8").reshape(
+                        (len(image.getbands()), image.height, image.width))
+                    for image in images
+                ])
                accelerator.trackers[0].writer.add_images(
-                    "test_samples", images_processed, epoch)
-                for _, image in enumerate(images_processed):
-                    image = Image.fromarray(image[0])
-
-                    if args.vae is not None and vqvae.config[
-                            'out_channels'] == 3:
-                        image = image.convert('L')
-
-                    audio = mel.image_to_audio(image)
+                    "test_samples", images, epoch)
+                for _, audio in enumerate(audios):
                    accelerator.trackers[0].writer.add_audio(
                        f"test_audio_{_}",
                        normalize(audio),
                        epoch,
-                        sample_rate=mel.get_sample_rate(),
+                        sample_rate=sample_rate,
                    )
         accelerator.wait_for_everyone()
@@ -353,7 +350,7 @@ if __name__ == "__main__":
     parser.add_argument("--from_pretrained", type=str, default=None)
     parser.add_argument("--start_epoch", type=int, default=0)
     parser.add_argument("--num_train_steps", type=int, default=1000)
-    parser.add_argument("--latent_resolution", type=int, default=64)
+    parser.add_argument("--latent_resolution", type=int, default=None)
     parser.add_argument("--scheduler",
                         type=str,
                         default="ddpm",
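The `--scheduler` flag (default `ddpm`) is what enables the DDIM training mentioned in the README update. The code that acts on it is not part of these hunks; a plausible sketch of how it could map onto the schedulers imported above:

```python
from diffusers import DDIMScheduler, DDPMScheduler


def make_noise_scheduler(name: str, num_train_steps: int):
    """Hypothetical helper: map the --scheduler argument onto a diffusers scheduler."""
    if name == "ddim":
        return DDIMScheduler(num_train_timesteps=num_train_steps)
    return DDPMScheduler(num_train_timesteps=num_train_steps)


# e.g. noise_scheduler = make_noise_scheduler(args.scheduler, args.num_train_steps)
```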
scripts/train_vae.py CHANGED
@@ -1,10 +1,8 @@
 # pip install -e git+https://github.com/CompVis/stable-diffusion.git@master
 # pip install -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
-# convert_original_stable_diffusion_to_diffusers.py
 
 # TODO
 # grayscale
-# update generate from audio to include vae step
 
 import os
 import argparse