Commit
·
b4e09e8
0
Parent(s):
Duplicate from kashif/music-spectrogram-diffusion
Browse files
Co-authored-by: Kashif Rasul <[email protected]>
- .gitattributes +34 -0
- README.md +30 -0
- continuous_encoder/config.json +14 -0
- continuous_encoder/diffusion_pytorch_model.bin +3 -0
- decoder/config.json +13 -0
- decoder/diffusion_pytorch_model.bin +3 -0
- melgan/model.onnx +3 -0
- model_index.json +24 -0
- notes_encoder/config.json +14 -0
- notes_encoder/diffusion_pytorch_model.bin +3 -0
- scheduler/scheduler_config.json +13 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
tags:
|
4 |
+
- pytorch
|
5 |
+
- diffusers
|
6 |
+
duplicated_from: kashif/music-spectrogram-diffusion
|
7 |
+
---
|
8 |
+
|
9 |
+
# Multi-instrument Music Synthesis with Spectrogram Diffusion
|
10 |
+
|
11 |
+
[Spectrogram Diffusion](https://arxiv.org/abs/2206.05408) by Curtis Hawthorne, Ian Simon, Adam Roberts, Neil Zeghidour, Josh Gardner, Ethan Manilow, and Jesse Engel.
|
12 |
+
|
13 |
+
## Abstract
|
14 |
+
|
15 |
+
An ideal music synthesizer should be both interactive and expressive, generating high-fidelity audio in realtime for arbitrary combinations of instruments and notes. Recent neural synthesizers have exhibited a tradeoff between domain-specific models that offer detailed control of only specific instruments, or raw waveform models that can train on any music but with minimal control and slow generation. In this work, we focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in realtime. This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments. We use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter. We compare training the decoder as an autoregressive model and as a Denoising Diffusion Probabilistic Model (DDPM) and find that the DDPM approach is superior both qualitatively and as measured by audio reconstruction and Fréchet distance metrics. Given the interactivity and generality of this approach, we find this to be a promising first step towards interactive and expressive neural synthesis for arbitrary combinations of instruments and notes.
|
16 |
+
|
17 |
+
<img src="https://storage.googleapis.com/music-synthesis-with-spectrogram-diffusion/architecture.png" alt="Architecture diagram">
|
18 |
+
|
19 |
+
## Example usage
|
20 |
+
|
21 |
+
```python
|
22 |
+
from diffusers import SpectrogramDiffusionPipeline
|
23 |
+
|
24 |
+
pipe = SpectrogramDiffusionPipeline.from_pretrained("kashif/music-spectrogram-diffusion")
|
25 |
+
pipe = pipe.to("cuda")
|
26 |
+
|
27 |
+
output = pipe("beethoven_hammerklavier_2.mid")
|
28 |
+
|
29 |
+
audio = output.audios[0]
|
30 |
+
```
|
continuous_encoder/config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_class_name": "SpectrogramContEncoder",
|
3 |
+
"_diffusers_version": "0.14.0.dev0",
|
4 |
+
"d_ff": 2048,
|
5 |
+
"d_kv": 64,
|
6 |
+
"d_model": 768,
|
7 |
+
"dropout_rate": 0.1,
|
8 |
+
"feed_forward_proj": "gated-gelu",
|
9 |
+
"input_dims": 128,
|
10 |
+
"is_decoder": false,
|
11 |
+
"num_heads": 12,
|
12 |
+
"num_layers": 12,
|
13 |
+
"targets_context_length": 256
|
14 |
+
}
|
continuous_encoder/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9ce0037cc2f9aad486141769202fed6a112db29487a358651424d1354ba52038
|
3 |
+
size 341044659
|
decoder/config.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_class_name": "T5FilmDecoder",
|
3 |
+
"_diffusers_version": "0.14.0.dev0",
|
4 |
+
"d_ff": 2048,
|
5 |
+
"d_kv": 64,
|
6 |
+
"d_model": 768,
|
7 |
+
"dropout_rate": 0.1,
|
8 |
+
"input_dims": 128,
|
9 |
+
"max_decoder_noise_time": 20000.0,
|
10 |
+
"num_heads": 12,
|
11 |
+
"num_layers": 12,
|
12 |
+
"targets_length": 256
|
13 |
+
}
|
decoder/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eda770548b0ac2432973dee6a99068fdf2120454e7599ea3b62acebde6aa1d5b
|
3 |
+
size 954931957
|
melgan/model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f7bfb7c042cfed63b133aa26ec440f7b2d08192823fbc2363499696b3720603
|
3 |
+
size 60487709
|
model_index.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_class_name": "SpectrogramDiffusionPipeline",
|
3 |
+
"_diffusers_version": "0.14.0.dev0",
|
4 |
+
"continuous_encoder": [
|
5 |
+
"spectrogram_diffusion",
|
6 |
+
"SpectrogramContEncoder"
|
7 |
+
],
|
8 |
+
"decoder": [
|
9 |
+
"diffusers",
|
10 |
+
"T5FilmDecoder"
|
11 |
+
],
|
12 |
+
"melgan": [
|
13 |
+
"diffusers",
|
14 |
+
"OnnxRuntimeModel"
|
15 |
+
],
|
16 |
+
"notes_encoder": [
|
17 |
+
"spectrogram_diffusion",
|
18 |
+
"SpectrogramNotesEncoder"
|
19 |
+
],
|
20 |
+
"scheduler": [
|
21 |
+
"diffusers",
|
22 |
+
"DDPMScheduler"
|
23 |
+
]
|
24 |
+
}
|
notes_encoder/config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_class_name": "SpectrogramNotesEncoder",
|
3 |
+
"_diffusers_version": "0.14.0.dev0",
|
4 |
+
"d_ff": 2048,
|
5 |
+
"d_kv": 64,
|
6 |
+
"d_model": 768,
|
7 |
+
"dropout_rate": 0.1,
|
8 |
+
"feed_forward_proj": "gated-gelu",
|
9 |
+
"is_decoder": false,
|
10 |
+
"max_length": 2048,
|
11 |
+
"num_heads": 12,
|
12 |
+
"num_layers": 12,
|
13 |
+
"vocab_size": 1536
|
14 |
+
}
|
notes_encoder/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be7f951c6ed13ef10ae9298c10c315d9d27a29a4da42fad9f24355967fd37e4c
|
3 |
+
size 350875059
|
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_class_name": "DDPMScheduler",
|
3 |
+
"_diffusers_version": "0.14.0.dev0",
|
4 |
+
"beta_end": 0.02,
|
5 |
+
"beta_schedule": "squaredcos_cap_v2",
|
6 |
+
"beta_start": 0.0001,
|
7 |
+
"clip_sample": true,
|
8 |
+
"clip_sample_range": 1.0,
|
9 |
+
"num_train_timesteps": 1000,
|
10 |
+
"prediction_type": "epsilon",
|
11 |
+
"trained_betas": null,
|
12 |
+
"variance_type": "fixed_large"
|
13 |
+
}
|