|
import re |
|
from abc import abstractmethod |
|
from contextlib import contextmanager |
|
from typing import Any, Dict, Tuple, Union |
|
|
|
import pytorch_lightning as pl |
|
import torch |
|
from omegaconf import ListConfig |
|
from packaging import version |
|
from safetensors.torch import load_file as load_safetensors |
|
|
|
from ..modules.diffusionmodules.model import Decoder, Encoder |
|
from ..modules.distributions.distributions import DiagonalGaussianDistribution |
|
from ..modules.ema import LitEma |
|
from ..util import default, get_obj_from_str, instantiate_from_config |
|
|
|
|
|
class AbstractAutoencoder(pl.LightningModule):
    """
    This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
    unCLIP models, etc. Hence, it is fairly general, and specific features
    (e.g. discriminator training, encoding, decoding) must be implemented in subclasses.
    """

    def __init__(
        self,
        ema_decay: Union[None, float] = None,
        monitor: Union[None, str] = None,
        input_key: str = "jpg",
        ckpt_path: Union[None, str] = None,
        ignore_keys: Union[Tuple, list, ListConfig] = (),
    ):
        """
        Args:
            ema_decay: decay passed to ``LitEma``; ``None`` disables EMA tracking.
            monitor: stored as ``self.monitor`` only when it is not ``None``.
            input_key: stored as ``self.input_key`` for subclasses' ``get_input``.
            ckpt_path: optional checkpoint to load via ``init_from_ckpt``.
            ignore_keys: regex patterns; matching state-dict keys are dropped
                before loading the checkpoint.
        """
        super().__init__()
        self.input_key = input_key
        self.use_ema = ema_decay is not None
        if monitor is not None:
            self.monitor = monitor

        # NOTE(review): the subclasses in this file create their sub-modules
        # only after this constructor returns, so both the EMA wrapper and a
        # checkpoint loaded here are set up before those modules exist.
        # AutoencoderKL works around this for the checkpoint by loading it
        # itself afterwards -- confirm whether this is intended for the others.
        if self.use_ema:
            self.model_ema = LitEma(self, decay=ema_decay)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

        # NOTE(review): manual optimization is switched on for torch >= 2.0.0;
        # verify that subclasses' training_step implementations agree with it.
        if version.parse(torch.__version__) >= version.parse("2.0.0"):
            self.automatic_optimization = False

    def init_from_ckpt(
        self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple()
    ) -> None:
        """
        Load weights from ``path`` into this module (non-strict).

        Args:
            path: file ending in ``ckpt`` (read with ``torch.load``, weights
                taken from its ``"state_dict"`` entry) or ``safetensors``.
            ignore_keys: regex patterns; every state-dict key for which
                ``re.match`` succeeds on at least one pattern is removed
                before loading.

        Raises:
            NotImplementedError: if ``path`` has neither supported suffix.
        """
        if path.endswith("ckpt"):
            # SECURITY: torch.load unpickles the file; only use trusted checkpoints.
            sd = torch.load(path, map_location="cpu")["state_dict"]
        elif path.endswith("safetensors"):
            sd = load_safetensors(path)
        else:
            raise NotImplementedError(
                f"Unsupported checkpoint format (expected ckpt or safetensors): {path}"
            )

        # Iterate over a snapshot of the keys because entries are deleted from
        # sd. Each key is tested once against all patterns, so a key matched
        # by several patterns is deleted exactly once (deleting it once per
        # matching pattern would raise KeyError on the second match).
        for k in list(sd.keys()):
            if any(re.match(ik, k) for ik in ignore_keys):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]

        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(
            f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
        )
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    @abstractmethod
    def get_input(self, batch) -> Any:
        """Extract the model input from a batch; must be provided by subclasses."""
        raise NotImplementedError()

    def on_train_batch_end(self, *args, **kwargs):
        """Call the EMA wrapper on this module after each training batch, if EMA is enabled."""
        # for EMA computation
        if self.use_ema:
            self.model_ema(self)

    @contextmanager
    def ema_scope(self, context=None):
        """
        Context manager that runs its body with the EMA weights copied into
        the module and restores the stored parameters on exit (also when the
        body raises). It does nothing when EMA is disabled.

        Args:
            context: optional label; when given, the switch and the restore
                are announced with ``print``.
        """
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    @abstractmethod
    def encode(self, *args, **kwargs) -> torch.Tensor:
        """Encode an input; must be provided by subclasses."""
        raise NotImplementedError("encode()-method of abstract base class called")

    @abstractmethod
    def decode(self, *args, **kwargs) -> torch.Tensor:
        """Decode a latent; must be provided by subclasses."""
        raise NotImplementedError("decode()-method of abstract base class called")

    def instantiate_optimizer_from_config(self, params, lr, cfg):
        """
        Build an optimizer from a config mapping.

        Args:
            params: parameters handed to the optimizer constructor.
            lr: learning rate, passed as the ``lr`` keyword.
            cfg: mapping with a ``"target"`` import path and optional
                ``"params"`` holding extra constructor keyword arguments.

        Returns:
            The object returned by calling the resolved target.
        """
        print(f"loading >>> {cfg['target']} <<< optimizer from config")
        return get_obj_from_str(cfg["target"])(
            params, lr=lr, **cfg.get("params", dict())
        )

    def configure_optimizers(self) -> Any:
        """Return the optimizers for training; must be provided by subclasses."""
        raise NotImplementedError()
|
|
|
|
|
class AutoencodingEngine(AbstractAutoencoder):
    """
    Common trainable image autoencoder (e.g. VQGAN- or AutoencoderKL-style
    models; those are additionally restored as explicit special cases for
    legacy reasons). Encoder, decoder, loss and regularizer are all built
    from configs, and regularization such as KL or VQ lives in the
    regularizer object rather than in this class.
    """

    def __init__(
        self,
        *args,
        encoder_config: Dict,
        decoder_config: Dict,
        loss_config: Dict,
        regularizer_config: Dict,
        optimizer_config: Union[Dict, None] = None,
        lr_g_factor: float = 1.0,
        **kwargs,
    ):
        """
        Args:
            encoder_config, decoder_config, loss_config, regularizer_config:
                configs handed to ``instantiate_from_config``.
            optimizer_config: optimizer config; falls back to
                ``{"target": "torch.optim.Adam"}`` through ``default``.
            lr_g_factor: multiplier applied to the learning rate of the
                autoencoder optimizer.
            *args, **kwargs: forwarded to ``AbstractAutoencoder``.
        """
        super().__init__(*args, **kwargs)

        # Sub-modules are created in this fixed order.
        self.encoder = instantiate_from_config(encoder_config)
        self.decoder = instantiate_from_config(decoder_config)
        self.loss = instantiate_from_config(loss_config)
        self.regularization = instantiate_from_config(regularizer_config)

        fallback_optimizer = {"target": "torch.optim.Adam"}
        self.optimizer_config = default(optimizer_config, fallback_optimizer)
        self.lr_g_factor = lr_g_factor

    def get_input(self, batch: Dict) -> torch.Tensor:
        """Return the entry of ``batch`` stored under ``self.input_key``."""
        return batch[self.input_key]

    def get_autoencoder_params(self) -> list:
        """Collect encoder, decoder, regularizer and autoencoder-side loss parameters."""
        collected = []
        collected.extend(self.encoder.parameters())
        collected.extend(self.decoder.parameters())
        collected.extend(self.regularization.get_trainable_parameters())
        collected.extend(self.loss.get_trainable_autoencoder_parameters())
        return collected

    def get_discriminator_params(self) -> list:
        """Return the loss object's trainable parameters as a list."""
        return list(self.loss.get_trainable_parameters())

    def get_last_layer(self):
        """Delegate to the decoder's ``get_last_layer``."""
        return self.decoder.get_last_layer()

    def encode(self, x: Any, return_reg_log: bool = False) -> Any:
        """
        Run the encoder followed by the regularizer.

        Returns the regularized latent, or ``(latent, reg_log)`` when
        ``return_reg_log`` is true.
        """
        latent, reg_log = self.regularization(self.encoder(x))
        return (latent, reg_log) if return_reg_log else latent

    def decode(self, z: Any) -> torch.Tensor:
        """Run the decoder on ``z``."""
        return self.decoder(z)

    def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode then decode; returns ``(latent, reconstruction, reg_log)``."""
        latent, reg_log = self.encode(x, return_reg_log=True)
        return latent, self.decode(latent), reg_log

    def _loss_for_optimizer(self, reg_log, x, xrec, optimizer_idx, split):
        """Evaluate ``self.loss`` for one optimizer index; returns ``(loss, log_dict)``."""
        return self.loss(
            reg_log,
            x,
            xrec,
            optimizer_idx,
            self.global_step,
            last_layer=self.get_last_layer(),
            split=split,
        )

    def training_step(self, batch, batch_idx, optimizer_idx) -> Any:
        """
        Compute and log the loss for the optimizer selected by
        ``optimizer_idx`` (0 and 1 are handled identically apart from the
        index passed to the loss). Any other index yields ``None``.
        """
        x = self.get_input(batch)
        _, xrec, reg_log = self(x)

        if optimizer_idx not in (0, 1):
            return None

        step_loss, step_log = self._loss_for_optimizer(
            reg_log, x, xrec, optimizer_idx, "train"
        )
        self.log_dict(
            step_log, prog_bar=False, logger=True, on_step=True, on_epoch=True
        )
        return step_loss

    def validation_step(self, batch, batch_idx) -> Dict:
        """Validate with the current weights and again inside ``ema_scope``; merge both logs."""
        merged = self._validation_step(batch, batch_idx)
        with self.ema_scope():
            merged.update(self._validation_step(batch, batch_idx, postfix="_ema"))
        return merged

    def _validation_step(self, batch, batch_idx, postfix="") -> Dict:
        """
        Evaluate the loss for optimizer indices 0 and 1 on one batch and log
        the results under the split name ``"val" + postfix``.
        """
        x = self.get_input(batch)
        _, xrec, reg_log = self(x)

        split = "val" + postfix
        _, ae_log = self._loss_for_optimizer(reg_log, x, xrec, 0, split)
        _, disc_log = self._loss_for_optimizer(reg_log, x, xrec, 1, split)

        # The reconstruction loss is logged on its own before the merged dict.
        rec_key = f"val{postfix}/rec_loss"
        self.log(rec_key, ae_log[rec_key])

        ae_log.update(disc_log)
        self.log_dict(ae_log)
        return ae_log

    def configure_optimizers(self) -> Any:
        """
        Create one optimizer for the autoencoder parameters (learning rate
        scaled by ``lr_g_factor``) and one for the discriminator parameters.
        Returns ``([opt_ae, opt_disc], [])``.
        """
        ae_params = self.get_autoencoder_params()
        disc_params = self.get_discriminator_params()

        ae_lr = default(self.lr_g_factor, 1.0) * self.learning_rate
        opt_ae = self.instantiate_optimizer_from_config(
            ae_params, ae_lr, self.optimizer_config
        )
        opt_disc = self.instantiate_optimizer_from_config(
            disc_params, self.learning_rate, self.optimizer_config
        )
        return [opt_ae, opt_disc], []

    @torch.no_grad()
    def log_images(self, batch: Dict, **kwargs) -> Dict:
        """
        Return a dict with the batch inputs, their reconstructions, and the
        reconstructions computed inside ``ema_scope``.
        """
        x = self.get_input(batch)
        _, xrec, _ = self(x)
        images = {"inputs": x, "reconstructions": xrec}
        with self.ema_scope():
            _, xrec_ema, _ = self(x)
            images["reconstructions_ema"] = xrec_ema
        return images
|
|
|
|
|
class AutoencoderKL(AutoencodingEngine):
    """
    Autoencoder whose encoder output is turned into a
    ``DiagonalGaussianDistribution``. The engine is first constructed with
    ``torch.nn.Identity`` placeholders; the real ``Encoder``/``Decoder`` and
    the 1x1 convolutions are attached afterwards.
    """

    def __init__(self, embed_dim: int, **kwargs):
        """
        Args:
            embed_dim: channel count used by the two 1x1 convolutions.
            **kwargs: must contain ``ddconfig`` and ``lossconfig``; may
                contain ``ckpt_path`` and ``ignore_keys``. Everything else is
                forwarded to ``AutoencodingEngine``.
        """
        ddconfig = kwargs.pop("ddconfig")
        # The checkpoint arguments are taken out of kwargs so they do not
        # reach the parent constructor; loading happens at the end, once all
        # modules of this class have been created.
        ckpt_path = kwargs.pop("ckpt_path", None)
        ignore_keys = kwargs.pop("ignore_keys", ())
        loss_config = kwargs.pop("lossconfig")

        placeholder = "torch.nn.Identity"
        super().__init__(
            encoder_config={"target": placeholder},
            decoder_config={"target": placeholder},
            regularizer_config={"target": placeholder},
            loss_config=loss_config,
            **kwargs,
        )

        assert ddconfig["double_z"]
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)

        z_channels = ddconfig["z_channels"]
        self.quant_conv = torch.nn.Conv2d(2 * z_channels, 2 * embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, z_channels, 1)
        self.embed_dim = embed_dim

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def encode(self, x):
        """
        Encode ``x`` and return the posterior as a
        ``DiagonalGaussianDistribution``. Asserts that the module is not in
        training mode.
        """
        assert (
            not self.training
        ), f"{self.__class__.__name__} only supports inference currently"
        moments = self.quant_conv(self.encoder(x))
        return DiagonalGaussianDistribution(moments)

    def decode(self, z, **decoder_kwargs):
        """Apply ``post_quant_conv`` to ``z`` and decode; extra kwargs go to the decoder."""
        return self.decoder(self.post_quant_conv(z), **decoder_kwargs)
|
|
|
|
|
class AutoencoderKLInferenceWrapper(AutoencoderKL):
    """Variant of ``AutoencoderKL`` whose ``encode`` returns a sample from the posterior."""

    def encode(self, x):
        """Encode ``x`` with the parent class and return ``.sample()`` of the result."""
        posterior = super().encode(x)
        return posterior.sample()
|
|
|
|
|
class IdentityFirstStage(AbstractAutoencoder):
    """Pass-through autoencoder: input extraction, encoding and decoding all return their argument."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``AbstractAutoencoder``."""
        super().__init__(*args, **kwargs)

    def get_input(self, x: Any) -> Any:
        """Return ``x`` unchanged."""
        return x

    def encode(self, x: Any, *args, **kwargs) -> Any:
        """Return ``x`` unchanged; additional arguments are ignored."""
        return x

    def decode(self, x: Any, *args, **kwargs) -> Any:
        """Return ``x`` unchanged; additional arguments are ignored."""
        return x
|
|