File size: 4,343 Bytes
86694c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d909077
86694c3
 
 
 
 
 
 
 
 
 
 
 
 
 
d909077
86694c3
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import numpy as np

# import keras
from kapre.time_frequency import Spectrogram
from tensorflow import keras

from generators.generator import *
from models.common.architectures import layers_map


"""
The STFT spectrogram of the input signal is fed
into a 2D CNN that predicts the synthesizer parameter
configuration. This configuration is then used to produce
a sound that is similar to the input sound.
"""


"""Model Architecture"""
# @paper:
# 1 2D Strided Convolution Layer C(38,13,26,13,26),
# where C(F,K1,K2,S1,S2) stands for a ReLU-activated
# 2D strided convolutional layer with F filters of size (K1,K2)
# and strides (S1,S2).


def assemble_model(
    src: np.ndarray,
    n_outputs: int,
    arch_layers: list,
    n_dft: int = 512,  # Orig:128
    n_hop: int = 256,  #  Orig:64
    data_format: str = "channels_first",
) -> keras.Model:
    """Build the spectrogram-based CNN as a Keras functional model.

    The graph is: input -> kapre ``Spectrogram`` -> ``Permute`` ->
    one ``Conv2D`` per entry of ``arch_layers`` -> ``Flatten`` ->
    ``Dense(512)`` -> ``Dense(n_outputs, sigmoid)``.

    Args:
        src: Sample array; only its ``.shape`` is read, and it is used as the
            input shape of the model and of the spectrogram layer.
        n_outputs: Number of units in the final sigmoid layer.
        arch_layers: Convolution specifications. Each entry must expose
            ``filters``, ``window_size``, ``strides`` and ``activation``
            attributes.
        n_dft: Forwarded to ``Spectrogram`` as ``n_dft``.
        n_hop: Forwarded to ``Spectrogram`` as ``n_hop``.
        data_format: ``"channels_first"`` or any other value (handled as
            channels-last by the permutation below). Forwarded to the
            spectrogram layer and to every convolution.

    Returns:
        The assembled, uncompiled ``keras.Model``.
    """
    stft_input = keras.Input(shape=src.shape, name="stft")

    # @paper: Spectrogram based CNN that receives the (log) spectrogram matrix as input

    # @kapre:
    # abs(Spectrogram) in a shape of 2D data, i.e.,
    # `(None, n_channel, n_freq, n_time)` if `'channels_first'`,
    # `(None, n_freq, n_time, n_channel)` if `'channels_last'`,
    spectrogram_layer = Spectrogram(
        n_dft=n_dft,
        n_hop=n_hop,
        input_shape=src.shape,
        trainable_kernel=True,
        name="static_stft",
        image_data_format=data_format,
        return_decibel_spectrogram=True,
    )
    features = spectrogram_layer(stft_input)

    # Swaps order to match the paper?
    # TODO: dig in to this (GPU only?)
    # Per the kapre shapes quoted above, both permutations exchange the
    # frequency and time axes and leave the channel axis where it is.
    axis_order = (1, 3, 2) if data_format == "channels_first" else (2, 1, 3)
    features = keras.layers.Permute(axis_order)(features)

    # x = keras.layers.Conv2D(64,(3,3),strides=(2,2),activation="relu",data_format="channels_last", padding='same')(x)
    # x = keras.layers.Conv2D(128,(3,3),strides=(2,2),activation="relu",data_format="channels_last", padding='same')(x)
    # x = keras.layers.Conv2D(128,(3,4),strides=(2,3),activation="relu",data_format="channels_last", padding='same')(x)
    # x = keras.layers.Conv2D(128,(3,3),strides=(2,2),activation="relu",data_format="channels_last", padding='same')(x)
    # x = keras.layers.Conv2D(256,(3,3),strides=(2,2),activation="relu",data_format="channels_last", padding='same')(x)
    # x = keras.layers.Conv2D(256,(3,3),strides=(1,2),activation="relu",data_format="channels_last", padding='same')(x)
    # Stack one convolution per architecture entry, in the order given.
    for layer_spec in arch_layers:
        convolution = keras.layers.Conv2D(
            filters=layer_spec.filters,
            kernel_size=layer_spec.window_size,
            strides=layer_spec.strides,
            activation=layer_spec.activation,
            data_format=data_format,
            padding="same",
        )
        features = convolution(features)

    # Flatten down to a single dimension
    flattened = keras.layers.Flatten()(features)

    # @paper: sigmoid activations with binary cross entropy loss
    # @paper: FC-512
    hidden = keras.layers.Dense(512)(flattened)

    # @paper: FC-368(sigmoid)
    prediction_layer = keras.layers.Dense(
        n_outputs, activation="sigmoid", name="predictions"
    )
    predictions = prediction_layer(hidden)

    return keras.Model(inputs=stft_input, outputs=predictions)


"""
Standard callback to get a model ready to train
"""


def get_model(
    model_name: str, inputs: int, outputs: int, data_format: str = "channels_last"
) -> keras.Model:
    """Resolve an architecture by name and assemble the spectrogram model.

    Args:
        model_name: Key looked up in ``layers_map``. When the key is absent a
            warning is printed and the ``"C1"`` entry is used instead.
        inputs: Length of the second axis of the ``(1, inputs)`` zero array
            whose shape is handed to ``assemble_model``.
        outputs: Passed to ``assemble_model`` as ``n_outputs``.
        data_format: Passed through to ``assemble_model`` unchanged.

    Returns:
        The model returned by ``assemble_model``.
    """
    if model_name not in layers_map:
        print(
            f"Warning: {model_name} is not compatible with the spectrogram model. C1 Architecture will be used instead."
        )
        chosen_layers = layers_map.get("C1")
    else:
        chosen_layers = layers_map.get(model_name)

    # Only the shape of this placeholder is consumed downstream.
    shape_template = np.zeros([1, inputs])
    return assemble_model(
        shape_template,
        n_outputs=outputs,
        arch_layers=chosen_layers,
        data_format=data_format,
    )


if __name__ == "__main__":
    # Script entry point: parse the run arguments, train the model, then run
    # inference and print the two paths it returns.
    from models.launch import train_model, inference
    from models.runner import standard_run_parser

    # Get a standard parser, and the arguments out of it
    setup = vars(standard_run_parser().parse_args())
    print(setup)

    # distinguish model type for reshaping
    setup["model_type"] = "STFT"
    # tf.config.run_functions_eagerly(True)

    # Actually train the model
    model, parameters_file = train_model(model_callback=get_model, **setup)

    file_path, csv_path = inference(model, parameters_file)
    for result_path in (file_path, csv_path):
        print(result_path)