# NOTE(review): the key names suggest model-architecture hyperparameters, but
# their exact meaning is not shown in this file; confirm against the loader.
# Keep one key per line: a plain scalar cannot contain ": ", so packing several
# pairs onto a single line does not parse as a mapping.
dim: 1024
depth: 22
heads: 16
ff_mult: 2
text_dim: 512
conv_layers: 4