abc committed on
Commit ac2ea1d · 1 Parent(s): 0380cfd

Upload 10 files
lycoris/dylora.py ADDED
@@ -0,0 +1,175 @@
+ import math
+ import random
+
+ from collections import OrderedDict, abc as container_abcs
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class DyLoraModule(nn.Module):
+     """
+     Hadamard product Implementation for Dynamic Low Rank adaptation
+     """
+
+     def __init__(
+         self,
+         lora_name,
+         org_module: nn.Module,
+         multiplier=1.0,
+         lora_dim=4, alpha=1,
+         dropout=0.,
+         use_cp=False,
+         block_size=1,
+         **kwargs,
+     ):
+         """ if alpha == 0 or None, alpha is rank (no scaling). """
+         super().__init__()
+         self.lora_name = lora_name
+         self.lora_dim = lora_dim
+         assert lora_dim % block_size == 0, 'lora_dim must be a multiple of block_size'
+         self.block_count = lora_dim//block_size
+         self.block_size = block_size
+
+         self.shape = org_module.weight.shape
+         if org_module.__class__.__name__ == 'Conv2d':
+             in_dim = org_module.in_channels
+             k_size = org_module.kernel_size
+             out_dim = org_module.out_channels
+             shape = (out_dim, in_dim*k_size[0]*k_size[1])
+             self.op = F.conv2d
+             self.extra_args = {
+                 "stride": org_module.stride,
+                 "padding": org_module.padding,
+                 "dilation": org_module.dilation,
+                 "groups": org_module.groups
+             }
+         else:
+             in_dim = org_module.in_features
+             out_dim = org_module.out_features
+             shape = (out_dim, in_dim)
+             self.op = F.linear
+             self.extra_args = {}
+
+         self.lora_dim = lora_dim
+         self.up_list = nn.ParameterList([
+             torch.empty(shape[0], 1)
+             for i in range(lora_dim)
+         ])
+         self.up_list.requires_grad_(False)
+         self.up_update = [
+             torch.zeros_like(self.up_list[i])
+             for i in range(lora_dim)
+         ]
+
+         self.down_list = nn.ParameterList([
+             torch.empty(1, shape[1])
+             for i in range(lora_dim)
+         ])
+         self.down_list.requires_grad_(False)
+         self.down_update = [
+             torch.zeros_like(self.down_list[i])
+             for i in range(lora_dim)
+         ]
+
+         self.index = 0
+
+         if type(alpha) == torch.Tensor:
+             alpha = alpha.detach().float().numpy()  # without casting, bf16 causes error
+         alpha = lora_dim if alpha is None or alpha == 0 else alpha
+         self.scale = alpha / self.lora_dim
+         self.register_buffer('alpha', torch.tensor(alpha))  # can be treated as a constant
+
+         # Need more experiences on init method
+
+         for v in self.down_list:
+             torch.nn.init.kaiming_uniform_(v, a=math.sqrt(5))
+         for v in self.up_list:
+             torch.nn.init.zeros_(v)
+         for i, v in enumerate(self.up_update):
+             v.copy_(self.up_list[i])
+         for i, v in enumerate(self.down_update):
+             v.copy_(self.down_list[i])
+
+         self.multiplier = multiplier
+         self.org_module = [org_module]  # remove in applying
+         self.grad_ckpt = False
+
+         self.apply_train(0)
+
+     def state_dict(self, *args, destination=None, prefix='', keep_vars=False):
+         # TODO: Remove `args` and the parsing logic when BC allows.
+         if len(args) > 0:
+             if destination is None:
+                 destination = args[0]
+             if len(args) > 1 and prefix == '':
+                 prefix = args[1]
+             if len(args) > 2 and keep_vars is False:
+                 keep_vars = args[2]
+             # DeprecationWarning is ignored by default
+
+         if destination is None:
+             destination = OrderedDict()
+             destination._metadata = OrderedDict()
+
+         local_metadata = dict(version=self._version)
+         if hasattr(destination, "_metadata"):
+             destination._metadata[prefix[:-1]] = local_metadata
+
+         destination[f'{prefix}alpha'] = self.alpha
+         destination[f'{prefix}lora_up.weight'] = nn.Parameter(
+             torch.concat(self.up_update, dim=1)
+         )
+         destination[f'{prefix}lora_down.weight'] = nn.Parameter(
+             torch.concat(self.down_update)
+         )
+         return destination
+
+     def apply_to(self):
+         self.org_module[0].forward = self.forward
+
+     def apply_train(self, b: int):
+         self.up_list.requires_grad_(False)
+         self.down_list.requires_grad_(False)
+
+         for i in range(self.index*self.block_size, (self.index+1)*self.block_size):
+             self.up_update[i].copy_(self.up_list[i])
+             self.down_update[i].copy_(self.down_list[i])
+
+         for i in range(b*self.block_size, (b+1)*self.block_size):
+             self.up_list[i].copy_(self.up_update[i])
+             self.down_list[i].copy_(self.down_update[i])
+
+         self.up_list.requires_grad_(True)
+         self.down_list.requires_grad_(True)
+         self.index = b
+
+     @torch.enable_grad()
+     def forward(self, x):
+         b = random.randint(0, self.block_count-1)
+         if self.up_update[b].device != self.up_list[b].device:
+             device = self.up_list[b].device
+             for i in range(self.lora_dim):
+                 self.up_update[i] = self.up_update[i].to(device)
+                 self.down_update[i] = self.down_update[i].to(device)
+
+         if self.training:
+             self.apply_train(b)
+         down = torch.concat(
+             list(self.down_update[:b*self.block_size])
+             + list(self.down_list[b*self.block_size:(b+1)*self.block_size])
+         )
+         up = torch.concat(
+             list(self.up_update[:b*self.block_size])
+             + list(self.up_list[b*self.block_size:(b+1)*self.block_size]),
+             dim=1
+         )
+
+         bias = None if self.org_module[0].bias is None else self.org_module[0].bias.data
+         return self.op(
+             x,
+             self.org_module[0].weight + (up@down).view(self.shape) * self.alpha/(b+1),
+             bias,
+             **self.extra_args
+         )
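Note (not part of the committed file): a minimal sketch of how DyLoraModule above can be attached to a single layer, assuming the lycoris package is importable and a recent PyTorch where nn.ParameterList accepts plain tensors, as the module relies on. The layer size, lora_name and hyperparameters here are illustrative only.

import torch
import torch.nn as nn
from lycoris.dylora import DyLoraModule

# hypothetical host layer; the module also accepts Conv2d
base = nn.Linear(768, 768)

# lora_dim must be divisible by block_size (asserted in __init__)
dylora = DyLoraModule('lora_unet_example', base, multiplier=1.0,
                      lora_dim=8, alpha=4, block_size=4)
dylora.apply_to()        # replaces base.forward with the DyLoRA forward

x = torch.randn(2, 768)
y = base(x)              # each forward pass samples a rank block at random
print(tuple(y.shape))    # the wrapped layer keeps its output shape

Since lora_up is initialized to zeros, the wrapped layer initially reproduces the original output; only the sampled block of rows/columns receives gradient updates per step.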
lycoris/ia3.py ADDED
@@ -0,0 +1,68 @@
+ import math
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class IA3Module(nn.Module):
+     """
+     Hadamard product Implementation for Low Rank Adaptation
+     """
+
+     def __init__(
+         self,
+         lora_name,
+         org_module: nn.Module,
+         multiplier=1.0,
+         train_on_input=False,
+         **kwargs
+     ):
+         """ if alpha == 0 or None, alpha is rank (no scaling). """
+         super().__init__()
+         self.lora_name = lora_name
+         self.cp = False
+
+         self.shape = org_module.weight.shape
+         if org_module.__class__.__name__ == 'Conv2d':
+             in_dim = org_module.in_channels
+             out_dim = org_module.out_channels
+             if train_on_input:
+                 train_dim = in_dim
+             else:
+                 train_dim = out_dim
+             self.weight = nn.Parameter(torch.empty(1, train_dim, 1, 1))
+         else:
+             in_dim = org_module.in_features
+             out_dim = org_module.out_features
+             if train_on_input:
+                 train_dim = in_dim
+             else:
+                 train_dim = out_dim
+
+             self.weight = nn.Parameter(torch.empty(train_dim))
+
+         # Need more experiences on init method
+         torch.nn.init.constant_(self.weight, 0)
+
+         self.multiplier = multiplier
+         self.org_forward = None
+         self.org_module = [org_module]  # remove in applying
+         self.grad_ckpt = False
+         self.train_input = train_on_input
+         self.register_buffer('on_input', torch.tensor(int(train_on_input)))
+
+     def apply_to(self):
+         self.org_forward = self.org_module[0].forward
+         self.org_module[0].forward = self.forward
+
+     @torch.enable_grad()
+     def forward(self, x):
+         if self.train_input:
+             x = x * (1 + self.weight * self.multiplier)
+         out = self.org_forward(x)
+         dtype = out.dtype
+         if not self.train_input:
+             out = out * (1 + self.weight * self.multiplier)
+         out = out.to(dtype)
+         return out
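As a quick illustration (again, not part of the commit), IA3Module keeps the wrapped layer's original forward and rescales either its input or its output with the learned vector. The layer and dimensions below are hypothetical.

import torch
import torch.nn as nn
from lycoris.ia3 import IA3Module

proj = nn.Linear(320, 320)      # hypothetical k_proj-style target layer
ia3 = IA3Module('lora_te_example', proj, multiplier=1.0, train_on_input=False)
ia3.apply_to()                  # stores proj's original forward, then wraps it

x = torch.randn(2, 77, 320)
out = proj(x)                   # output scaled by (1 + weight * multiplier)

Because the scaling vector is initialized to zero, the wrapper is an identity at the start of training; train_on_input=True moves the scaling to the input side, which the network code below uses for the MLP output projections.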
lycoris/kohya.py CHANGED
@@ -13,6 +13,9 @@ import torch
  from .kohya_utils import *
  from .locon import LoConModule
  from .loha import LohaModule
+ from .ia3 import IA3Module
+ from .lokr import LokrModule
+ from .dylora import DyLoraModule


  def create_network(multiplier, network_dim, network_alpha, vae, text_encoder, unet, **kwargs):
@@ -21,39 +24,55 @@ def create_network(multiplier, network_dim, network_alpha, vae, text_encoder, un
      conv_dim = int(kwargs.get('conv_dim', network_dim))
      conv_alpha = float(kwargs.get('conv_alpha', network_alpha))
      dropout = float(kwargs.get('dropout', 0.))
-     algo = kwargs.get('algo', 'lora')
-     disable_cp = kwargs.get('disable_conv_cp', False)
+     algo = kwargs.get('algo', 'lora').lower()
+     use_cp = (not kwargs.get('disable_conv_cp', True)
+               or kwargs.get('use_conv_cp', False))
+     block_size = int(kwargs.get('block_size', 4))
      network_module = {
          'lora': LoConModule,
+         'locon': LoConModule,
          'loha': LohaModule,
+         'ia3': IA3Module,
+         'lokr': LokrModule,
+         'dylora': DyLoraModule,
      }[algo]

      print(f'Using rank adaptation algo: {algo}')

-     if (algo == 'loha'
+     if ((algo == 'loha' or algo == 'lokr')
          and not kwargs.get('no_dim_warn', False)
          and (network_dim>64 or conv_dim>64)):
          print('='*20 + 'WARNING' + '='*20)
-         warn(
-             (
-                 "You are not supposed to use dim>64 (64*64 = 4096, it already has enough rank)"
-                 "in Hadamard Product representation!\n"
-                 "Please consider use lower dim or disable this warning with --network_args no_dim_warn=True\n"
-                 "If you just want to use high dim loha, please consider use lower lr."
-             ),
-             stacklevel=2,
-         )
+         warning_type = {
+             'loha': "Hadamard Product representation",
+             'lokr': "Kronecker Product representation"
+         }
+         warning_msg = f"""You are not supposed to use dim>64 (64*64 = 4096, it already has enough rank)\n
+         in {warning_type[algo]}!\n
+         Please consider use lower dim or disable this warning with --network_args no_dim_warn=True\n
+         If you just want to use high dim {algo}, please consider use lower lr.
+         """
+         warn(warning_msg, stacklevel=2)
          print('='*20 + 'WARNING' + '='*20)

-     network = LycorisNetwork(
-         text_encoder, unet,
-         multiplier=multiplier,
-         lora_dim=network_dim, conv_lora_dim=conv_dim,
-         alpha=network_alpha, conv_alpha=conv_alpha,
-         dropout=dropout,
-         use_cp=(not bool(disable_cp)),
-         network_module=network_module
-     )
+     if algo == 'ia3':
+         network = IA3Network(
+             text_encoder, unet,
+             multiplier = multiplier,
+         )
+     else:
+         network = LycorisNetwork(
+             text_encoder, unet,
+             multiplier=multiplier,
+             lora_dim=network_dim, conv_lora_dim=conv_dim,
+             alpha=network_alpha, conv_alpha=conv_alpha,
+             dropout=dropout,
+             use_cp=use_cp,
+             network_module=network_module,
+             decompose_both=kwargs.get('decompose_both', False),
+             factor=kwargs.get('factor', -1),
+             block_size = block_size
+         )

      return network

@@ -86,8 +105,9 @@ class LycorisNetwork(torch.nn.Module):
          multiplier=1.0,
          lora_dim=4, conv_lora_dim=4,
          alpha=1, conv_alpha=1,
-         use_cp = True,
+         use_cp = False,
          dropout = 0, network_module = LoConModule,
+         **kwargs,
      ) -> None:
          super().__init__()
          self.multiplier = multiplier
@@ -124,19 +144,25 @@ class LycorisNetwork(torch.nn.Module):
                          if child_module.__class__.__name__ == 'Linear' and lora_dim>0:
                              lora = network_module(
                                  lora_name, child_module, self.multiplier,
-                                 self.lora_dim, self.alpha, self.dropout, use_cp
+                                 self.lora_dim, self.alpha,
+                                 self.dropout, use_cp,
+                                 **kwargs
                              )
                          elif child_module.__class__.__name__ == 'Conv2d':
                              k_size, *_ = child_module.kernel_size
                              if k_size==1 and lora_dim>0:
                                  lora = network_module(
                                      lora_name, child_module, self.multiplier,
-                                     self.lora_dim, self.alpha, self.dropout, use_cp
+                                     self.lora_dim, self.alpha,
+                                     self.dropout, use_cp,
+                                     **kwargs
                                  )
                              elif conv_lora_dim>0:
                                  lora = network_module(
                                      lora_name, child_module, self.multiplier,
-                                     self.conv_lora_dim, self.conv_alpha, self.dropout, use_cp
+                                     self.conv_lora_dim, self.conv_alpha,
+                                     self.dropout, use_cp,
+                                     **kwargs
                                  )
                              else:
                                  continue
@@ -149,19 +175,25 @@ class LycorisNetwork(torch.nn.Module):
                      if module.__class__.__name__ == 'Linear' and lora_dim>0:
                          lora = network_module(
                              lora_name, module, self.multiplier,
-                             self.lora_dim, self.alpha, self.dropout, use_cp
+                             self.lora_dim, self.alpha,
+                             self.dropout, use_cp,
+                             **kwargs
                          )
                      elif module.__class__.__name__ == 'Conv2d':
                          k_size, *_ = module.kernel_size
                          if k_size==1 and lora_dim>0:
                              lora = network_module(
                                  lora_name, module, self.multiplier,
-                                 self.lora_dim, self.alpha, self.dropout, use_cp
+                                 self.lora_dim, self.alpha,
+                                 self.dropout, use_cp,
+                                 **kwargs
                              )
                          elif conv_lora_dim>0:
                              lora = network_module(
                                  lora_name, module, self.multiplier,
-                                 self.conv_lora_dim, self.conv_alpha, self.dropout, use_cp
+                                 self.conv_lora_dim, self.conv_alpha,
+                                 self.dropout, use_cp,
+                                 **kwargs
                              )
                          else:
                              continue
@@ -306,3 +338,205 @@ class LycorisNetwork(torch.nn.Module):
              save_file(state_dict, file, metadata)
          else:
              torch.save(state_dict, file)
+
+
+ class IA3Network(torch.nn.Module):
+     '''
+     IA3 network
+     '''
+     # Ignore proj_in or proj_out, their channels is only a few.
+     UNET_TARGET_REPLACE_MODULE = []
+     UNET_TARGET_REPLACE_NAME = ["to_k", "to_v", "ff.net.2"]
+     TEXT_ENCODER_TARGET_REPLACE_MODULE = []
+     TEXT_ENCODER_TARGET_REPLACE_NAME = ["k_proj", "v_proj", "mlp.fc2"]
+     TRAIN_INPUT = ["mlp.fc2", "ff.net.2"]
+     LORA_PREFIX_UNET = 'lora_unet'
+     LORA_PREFIX_TEXT_ENCODER = 'lora_te'
+
+     def __init__(
+         self,
+         text_encoder, unet,
+         multiplier=1.0,
+         **kwargs,
+     ) -> None:
+         super().__init__()
+         self.multiplier = multiplier
+
+         # create module instances
+         def create_modules(
+             prefix,
+             root_module: torch.nn.Module,
+             target_replace_modules,
+             target_replace_names = [],
+             target_train_input = []
+         ) -> List[IA3Module]:
+             print('Create LyCORIS Module')
+             loras = []
+             for name, module in root_module.named_modules():
+                 if module.__class__.__name__ in target_replace_modules:
+                     for child_name, child_module in module.named_modules():
+                         lora_name = prefix + '.' + name + '.' + child_name
+                         lora_name = lora_name.replace('.', '_')
+                         if child_module.__class__.__name__ in {'Linear', 'Conv2d'}:
+                             lora = IA3Module(
+                                 lora_name, child_module, self.multiplier,
+                                 name in target_train_input,
+                                 **kwargs,
+                             )
+                             loras.append(lora)
+                 elif any(i in name for i in target_replace_names):
+                     lora_name = prefix + '.' + name
+                     lora_name = lora_name.replace('.', '_')
+                     if module.__class__.__name__ in {'Linear', 'Conv2d'}:
+                         lora = IA3Module(
+                             lora_name, module, self.multiplier,
+                             name in target_train_input,
+                             **kwargs,
+                         )
+                         loras.append(lora)
+             return loras
+
+         self.text_encoder_loras = create_modules(
+             IA3Network.LORA_PREFIX_TEXT_ENCODER,
+             text_encoder,
+             IA3Network.TEXT_ENCODER_TARGET_REPLACE_MODULE,
+             IA3Network.TEXT_ENCODER_TARGET_REPLACE_NAME,
+             IA3Network.TRAIN_INPUT
+         )
+         print(f"create LyCORIS for Text Encoder: {len(self.text_encoder_loras)} modules.")
+
+         self.unet_loras = create_modules(
+             IA3Network.LORA_PREFIX_UNET,
+             unet,
+             IA3Network.UNET_TARGET_REPLACE_MODULE,
+             IA3Network.UNET_TARGET_REPLACE_NAME,
+             IA3Network.TRAIN_INPUT
+         )
+         print(f"create LyCORIS for U-Net: {len(self.unet_loras)} modules.")
+
+         self.weights_sd = None
+
+         # assertion
+         names = set()
+         for lora in self.text_encoder_loras + self.unet_loras:
+             assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}"
+             names.add(lora.lora_name)
+
+     def set_multiplier(self, multiplier):
+         self.multiplier = multiplier
+         for lora in self.text_encoder_loras + self.unet_loras:
+             lora.multiplier = self.multiplier
+
+     def load_weights(self, file):
+         if os.path.splitext(file)[1] == '.safetensors':
+             from safetensors.torch import load_file, safe_open
+             self.weights_sd = load_file(file)
+         else:
+             self.weights_sd = torch.load(file, map_location='cpu')
+
+     def apply_to(self, text_encoder, unet, apply_text_encoder=None, apply_unet=None):
+         if self.weights_sd:
+             weights_has_text_encoder = weights_has_unet = False
+             for key in self.weights_sd.keys():
+                 if key.startswith(LycorisNetwork.LORA_PREFIX_TEXT_ENCODER):
+                     weights_has_text_encoder = True
+                 elif key.startswith(LycorisNetwork.LORA_PREFIX_UNET):
+                     weights_has_unet = True
+
+             if apply_text_encoder is None:
+                 apply_text_encoder = weights_has_text_encoder
+             else:
+                 assert apply_text_encoder == weights_has_text_encoder, f"text encoder weights: {weights_has_text_encoder} but text encoder flag: {apply_text_encoder} / the weights and the text encoder flag contradict each other"
+
+             if apply_unet is None:
+                 apply_unet = weights_has_unet
+             else:
+                 assert apply_unet == weights_has_unet, f"u-net weights: {weights_has_unet} but u-net flag: {apply_unet} / the weights and the U-Net flag contradict each other"
+         else:
+             assert apply_text_encoder is not None and apply_unet is not None, f"internal error: flag not set"
+
+         if apply_text_encoder:
+             print("enable LyCORIS for text encoder")
+         else:
+             self.text_encoder_loras = []
+
+         if apply_unet:
+             print("enable LyCORIS for U-Net")
+         else:
+             self.unet_loras = []
+
+         for lora in self.text_encoder_loras + self.unet_loras:
+             lora.apply_to()
+             self.add_module(lora.lora_name, lora)
+
+         if self.weights_sd:
+             # if some weights are not in state dict, it is ok because initial LoRA does nothing (lora_up is initialized by zeros)
+             info = self.load_state_dict(self.weights_sd, False)
+             print(f"weights are loaded: {info}")
+
+     def enable_gradient_checkpointing(self):
+         # not supported
+         def make_ckpt(module):
+             if isinstance(module, torch.nn.Module):
+                 module.grad_ckpt = True
+         self.apply(make_ckpt)
+         pass
+
+     def prepare_optimizer_params(self, text_encoder_lr, unet_lr):
+         def enumerate_params(loras):
+             params = []
+             for lora in loras:
+                 params.extend(lora.parameters())
+             return params
+
+         self.requires_grad_(True)
+         all_params = []
+
+         if self.text_encoder_loras:
+             param_data = {'params': enumerate_params(self.text_encoder_loras)}
+             if text_encoder_lr is not None:
+                 param_data['lr'] = text_encoder_lr
+             all_params.append(param_data)
+
+         if self.unet_loras:
+             param_data = {'params': enumerate_params(self.unet_loras)}
+             if unet_lr is not None:
+                 param_data['lr'] = unet_lr
+             all_params.append(param_data)
+
+         return all_params
+
+     def prepare_grad_etc(self, text_encoder, unet):
+         self.requires_grad_(True)
+
+     def on_epoch_start(self, text_encoder, unet):
+         self.train()
+
+     def get_trainable_params(self):
+         return self.parameters()
+
+     def save_weights(self, file, dtype, metadata):
+         if metadata is not None and len(metadata) == 0:
+             metadata = None
+
+         state_dict = self.state_dict()
+
+         if dtype is not None:
+             for key in list(state_dict.keys()):
+                 v = state_dict[key]
+                 v = v.detach().clone().to("cpu").to(dtype)
+                 state_dict[key] = v
+
+         if os.path.splitext(file)[1] == '.safetensors':
+             from safetensors.torch import save_file
+
+             # Precalculate model hashes to save time on indexing
+             if metadata is None:
+                 metadata = {}
+             model_hash, legacy_hash = precalculate_safetensors_hashes(state_dict, metadata)
+             metadata["sshs_model_hash"] = model_hash
+             metadata["sshs_legacy_hash"] = legacy_hash
+
+             save_file(state_dict, file, metadata)
+         else:
+             torch.save(state_dict, file)
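For reference, a hedged sketch (not part of the diff) of how the kwargs surfaced above (algo, block_size, use_conv_cp, and so on) could be passed to create_network. The vae, text_encoder and unet arguments are placeholders; the kohya-ss training scripts normally construct them and call this factory for you.

from lycoris.kohya import create_network

def build_lycoris(vae, text_encoder, unet):
    # positional arguments: multiplier, network_dim, network_alpha
    return create_network(
        1.0, 8, 4,
        vae, text_encoder, unet,
        algo='dylora',      # any of: lora, locon, loha, lokr, ia3, dylora
        conv_dim=4, conv_alpha=1,
        block_size=4,       # DyLoRA only; lora_dim must be divisible by it
        use_conv_cp=True,   # opt in to CP decomposition for 3x3 conv layers
    )

When algo='ia3' the factory returns the new IA3Network instead of LycorisNetwork, so dim/alpha related kwargs are ignored in that case.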
lycoris/kohya_model_utils.py CHANGED
@@ -1,13 +1,10 @@
- '''
- https://github.com/kohya-ss/sd-scripts/blob/main/library/model_util.py
- '''
  # v1: split from train_db_fixed.py.
  # v2: support safetensors

  import math
  import os
  import torch
- from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig
  from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel
  from safetensors.torch import load_file, save_file

@@ -19,7 +16,7 @@ BETA_END = 0.0120
  UNET_PARAMS_MODEL_CHANNELS = 320
  UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4]
  UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1]
- UNET_PARAMS_IMAGE_SIZE = 32  # unused
  UNET_PARAMS_IN_CHANNELS = 4
  UNET_PARAMS_OUT_CHANNELS = 4
  UNET_PARAMS_NUM_RES_BLOCKS = 2
@@ -48,596 +45,574 @@ DIFFUSERS_REF_MODEL_ID_V2 = "stabilityai/stable-diffusion-2-1"
48
 
49
 
50
  def shave_segments(path, n_shave_prefix_segments=1):
51
- """
52
- Removes segments. Positive values shave the first segments, negative shave the last segments.
53
- """
54
- if n_shave_prefix_segments >= 0:
55
- return ".".join(path.split(".")[n_shave_prefix_segments:])
56
- else:
57
- return ".".join(path.split(".")[:n_shave_prefix_segments])
58
 
59
 
60
  def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
61
- """
62
- Updates paths inside resnets to the new naming scheme (local renaming)
63
- """
64
- mapping = []
65
- for old_item in old_list:
66
- new_item = old_item.replace("in_layers.0", "norm1")
67
- new_item = new_item.replace("in_layers.2", "conv1")
68
 
69
- new_item = new_item.replace("out_layers.0", "norm2")
70
- new_item = new_item.replace("out_layers.3", "conv2")
71
 
72
- new_item = new_item.replace("emb_layers.1", "time_emb_proj")
73
- new_item = new_item.replace("skip_connection", "conv_shortcut")
74
 
75
- new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
76
 
77
- mapping.append({"old": old_item, "new": new_item})
78
 
79
- return mapping
80
 
81
 
82
  def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
83
- """
84
- Updates paths inside resnets to the new naming scheme (local renaming)
85
- """
86
- mapping = []
87
- for old_item in old_list:
88
- new_item = old_item
89
 
90
- new_item = new_item.replace("nin_shortcut", "conv_shortcut")
91
- new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
92
 
93
- mapping.append({"old": old_item, "new": new_item})
94
 
95
- return mapping
96
 
97
 
98
  def renew_attention_paths(old_list, n_shave_prefix_segments=0):
99
- """
100
- Updates paths inside attentions to the new naming scheme (local renaming)
101
- """
102
- mapping = []
103
- for old_item in old_list:
104
- new_item = old_item
105
 
106
- # new_item = new_item.replace('norm.weight', 'group_norm.weight')
107
- # new_item = new_item.replace('norm.bias', 'group_norm.bias')
108
 
109
- # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
110
- # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
111
 
112
- # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
113
 
114
- mapping.append({"old": old_item, "new": new_item})
115
 
116
- return mapping
117
 
118
 
119
  def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
120
- """
121
- Updates paths inside attentions to the new naming scheme (local renaming)
122
- """
123
- mapping = []
124
- for old_item in old_list:
125
- new_item = old_item
126
 
127
- new_item = new_item.replace("norm.weight", "group_norm.weight")
128
- new_item = new_item.replace("norm.bias", "group_norm.bias")
129
 
130
- new_item = new_item.replace("q.weight", "query.weight")
131
- new_item = new_item.replace("q.bias", "query.bias")
132
 
133
- new_item = new_item.replace("k.weight", "key.weight")
134
- new_item = new_item.replace("k.bias", "key.bias")
135
 
136
- new_item = new_item.replace("v.weight", "value.weight")
137
- new_item = new_item.replace("v.bias", "value.bias")
138
 
139
- new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
140
- new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
141
 
142
- new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
143
 
144
- mapping.append({"old": old_item, "new": new_item})
145
 
146
- return mapping
147
 
148
 
149
  def assign_to_checkpoint(
150
  paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
151
  ):
152
- """
153
- This does the final conversion step: take locally converted weights and apply a global renaming
154
- to them. It splits attention layers, and takes into account additional replacements
155
- that may arise.
156
 
157
- Assigns the weights to the new checkpoint.
158
- """
159
- assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
160
 
161
- # Splits the attention layers into three variables.
162
- if attention_paths_to_split is not None:
163
- for path, path_map in attention_paths_to_split.items():
164
- old_tensor = old_checkpoint[path]
165
- channels = old_tensor.shape[0] // 3
166
 
167
- target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
168
 
169
- num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
170
 
171
- old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
172
- query, key, value = old_tensor.split(channels // num_heads, dim=1)
173
 
174
- checkpoint[path_map["query"]] = query.reshape(target_shape)
175
- checkpoint[path_map["key"]] = key.reshape(target_shape)
176
- checkpoint[path_map["value"]] = value.reshape(target_shape)
177
 
178
- for path in paths:
179
- new_path = path["new"]
180
 
181
- # These have already been assigned
182
- if attention_paths_to_split is not None and new_path in attention_paths_to_split:
183
- continue
184
 
185
- # Global renaming happens here
186
- new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
187
- new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
188
- new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
189
 
190
- if additional_replacements is not None:
191
- for replacement in additional_replacements:
192
- new_path = new_path.replace(replacement["old"], replacement["new"])
193
 
194
- # proj_attn.weight has to be converted from conv 1D to linear
195
- if "proj_attn.weight" in new_path:
196
- checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
197
- else:
198
- checkpoint[new_path] = old_checkpoint[path["old"]]
199
 
200
 
201
  def conv_attn_to_linear(checkpoint):
202
- keys = list(checkpoint.keys())
203
- attn_keys = ["query.weight", "key.weight", "value.weight"]
204
- for key in keys:
205
- if ".".join(key.split(".")[-2:]) in attn_keys:
206
- if checkpoint[key].ndim > 2:
207
- checkpoint[key] = checkpoint[key][:, :, 0, 0]
208
- elif "proj_attn.weight" in key:
209
- if checkpoint[key].ndim > 2:
210
- checkpoint[key] = checkpoint[key][:, :, 0]
211
 
212
 
213
  def linear_transformer_to_conv(checkpoint):
214
- keys = list(checkpoint.keys())
215
- tf_keys = ["proj_in.weight", "proj_out.weight"]
216
- for key in keys:
217
- if ".".join(key.split(".")[-2:]) in tf_keys:
218
- if checkpoint[key].ndim == 2:
219
- checkpoint[key] = checkpoint[key].unsqueeze(2).unsqueeze(2)
220
 
221
 
222
  def convert_ldm_unet_checkpoint(v2, checkpoint, config):
223
- """
224
- Takes a state dict and a config, and returns a converted checkpoint.
225
- """
226
-
227
- # extract state_dict for UNet
228
- unet_state_dict = {}
229
- unet_key = "model.diffusion_model."
230
- keys = list(checkpoint.keys())
231
- for key in keys:
232
- if key.startswith(unet_key):
233
- unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
234
-
235
- new_checkpoint = {}
236
-
237
- new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
238
- new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
239
- new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
240
- new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
241
-
242
- new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
243
- new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
244
-
245
- new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
246
- new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
247
- new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
248
- new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
249
-
250
- # Retrieves the keys for the input blocks only
251
- num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
252
- input_blocks = {
253
- layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
254
- for layer_id in range(num_input_blocks)
255
- }
256
-
257
- # Retrieves the keys for the middle blocks only
258
- num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
259
- middle_blocks = {
260
- layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key]
261
- for layer_id in range(num_middle_blocks)
262
- }
263
-
264
- # Retrieves the keys for the output blocks only
265
- num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
266
- output_blocks = {
267
- layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
268
- for layer_id in range(num_output_blocks)
269
- }
270
-
271
- for i in range(1, num_input_blocks):
272
- block_id = (i - 1) // (config["layers_per_block"] + 1)
273
- layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
274
-
275
- resnets = [
276
- key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
277
- ]
278
- attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
279
-
280
- if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
281
- new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
282
- f"input_blocks.{i}.0.op.weight"
283
- )
284
- new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
285
- f"input_blocks.{i}.0.op.bias"
286
- )
287
-
288
- paths = renew_resnet_paths(resnets)
289
- meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
290
- assign_to_checkpoint(
291
- paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
292
- )
293
-
294
- if len(attentions):
295
- paths = renew_attention_paths(attentions)
296
- meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
297
- assign_to_checkpoint(
298
- paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
299
- )
300
-
301
- resnet_0 = middle_blocks[0]
302
- attentions = middle_blocks[1]
303
- resnet_1 = middle_blocks[2]
304
-
305
- resnet_0_paths = renew_resnet_paths(resnet_0)
306
- assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
307
-
308
- resnet_1_paths = renew_resnet_paths(resnet_1)
309
- assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
310
-
311
- attentions_paths = renew_attention_paths(attentions)
312
- meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
313
- assign_to_checkpoint(
314
- attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
315
- )
316
-
317
- for i in range(num_output_blocks):
318
- block_id = i // (config["layers_per_block"] + 1)
319
- layer_in_block_id = i % (config["layers_per_block"] + 1)
320
- output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
321
- output_block_list = {}
322
-
323
- for layer in output_block_layers:
324
- layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
325
- if layer_id in output_block_list:
326
- output_block_list[layer_id].append(layer_name)
327
- else:
328
- output_block_list[layer_id] = [layer_name]
329
-
330
- if len(output_block_list) > 1:
331
- resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
332
- attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
333
-
334
- resnet_0_paths = renew_resnet_paths(resnets)
335
- paths = renew_resnet_paths(resnets)
336
-
337
- meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
338
- assign_to_checkpoint(
339
- paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
340
- )
341
-
342
- # オリジナル:
343
- # if ["conv.weight", "conv.bias"] in output_block_list.values():
344
- # index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
345
-
346
- # biasとweightの順番に依存しないようにする:もっといいやり方がありそうだが
347
- for l in output_block_list.values():
348
- l.sort()
349
-
350
- if ["conv.bias", "conv.weight"] in output_block_list.values():
351
- index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
352
- new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
353
- f"output_blocks.{i}.{index}.conv.bias"
354
- ]
355
- new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
356
- f"output_blocks.{i}.{index}.conv.weight"
357
- ]
358
-
359
- # Clear attentions as they have been attributed above.
360
- if len(attentions) == 2:
361
- attentions = []
362
-
363
- if len(attentions):
364
- paths = renew_attention_paths(attentions)
365
- meta_path = {
366
- "old": f"output_blocks.{i}.1",
367
- "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
368
- }
369
- assign_to_checkpoint(
370
- paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
371
- )
372
- else:
373
- resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
374
- for path in resnet_0_paths:
375
- old_path = ".".join(["output_blocks", str(i), path["old"]])
376
- new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
377
-
378
- new_checkpoint[new_path] = unet_state_dict[old_path]
379
-
380
- # SDのv2では1*1のconv2dがlinearに変わっているので、linear->convに変換する
381
- if v2:
382
- linear_transformer_to_conv(new_checkpoint)
383
 
384
- return new_checkpoint
385
 
386
 
387
  def convert_ldm_vae_checkpoint(checkpoint, config):
388
- # extract state dict for VAE
389
- vae_state_dict = {}
390
- vae_key = "first_stage_model."
391
- keys = list(checkpoint.keys())
392
- for key in keys:
393
- if key.startswith(vae_key):
394
- vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
395
- # if len(vae_state_dict) == 0:
396
- # # 渡されたcheckpointは.ckptから読み込んだcheckpointではなくvaeのstate_dict
397
- # vae_state_dict = checkpoint
398
-
399
- new_checkpoint = {}
400
-
401
- new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
402
- new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
403
- new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
404
- new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
405
- new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
406
- new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
407
-
408
- new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
409
- new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
410
- new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
411
- new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
412
- new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
413
- new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
414
-
415
- new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
416
- new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
417
- new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
418
- new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
419
-
420
- # Retrieves the keys for the encoder down blocks only
421
- num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
422
- down_blocks = {
423
- layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
424
- }
425
-
426
- # Retrieves the keys for the decoder up blocks only
427
- num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
428
- up_blocks = {
429
- layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
430
- }
431
-
432
- for i in range(num_down_blocks):
433
- resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
434
-
435
- if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
436
- new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
437
- f"encoder.down.{i}.downsample.conv.weight"
438
- )
439
- new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
440
- f"encoder.down.{i}.downsample.conv.bias"
441
- )
442
-
443
- paths = renew_vae_resnet_paths(resnets)
444
- meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
445
- assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
446
-
447
- mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
448
- num_mid_res_blocks = 2
449
- for i in range(1, num_mid_res_blocks + 1):
450
- resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
451
-
452
- paths = renew_vae_resnet_paths(resnets)
453
- meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
 
454
  assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
455
-
456
- mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
457
- paths = renew_vae_attention_paths(mid_attentions)
458
- meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
459
- assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
460
- conv_attn_to_linear(new_checkpoint)
461
-
462
- for i in range(num_up_blocks):
463
- block_id = num_up_blocks - 1 - i
464
- resnets = [
465
- key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
466
- ]
467
-
468
- if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
469
- new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
470
- f"decoder.up.{block_id}.upsample.conv.weight"
471
- ]
472
- new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
473
- f"decoder.up.{block_id}.upsample.conv.bias"
474
- ]
475
-
476
- paths = renew_vae_resnet_paths(resnets)
477
- meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
 
478
  assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
479
-
480
- mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
481
- num_mid_res_blocks = 2
482
- for i in range(1, num_mid_res_blocks + 1):
483
- resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
484
-
485
- paths = renew_vae_resnet_paths(resnets)
486
- meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
487
- assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
488
-
489
- mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
490
- paths = renew_vae_attention_paths(mid_attentions)
491
- meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
492
- assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
493
- conv_attn_to_linear(new_checkpoint)
494
- return new_checkpoint
495
 
496
 
497
  def create_unet_diffusers_config(v2):
498
- """
499
- Creates a config for the diffusers based on the config of the LDM model.
500
- """
501
- # unet_params = original_config.model.params.unet_config.params
502
-
503
- block_out_channels = [UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT]
504
-
505
- down_block_types = []
506
- resolution = 1
507
- for i in range(len(block_out_channels)):
508
- block_type = "CrossAttnDownBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "DownBlock2D"
509
- down_block_types.append(block_type)
510
- if i != len(block_out_channels) - 1:
511
- resolution *= 2
512
-
513
- up_block_types = []
514
- for i in range(len(block_out_channels)):
515
- block_type = "CrossAttnUpBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "UpBlock2D"
516
- up_block_types.append(block_type)
517
- resolution //= 2
518
-
519
- config = dict(
520
- sample_size=UNET_PARAMS_IMAGE_SIZE,
521
- in_channels=UNET_PARAMS_IN_CHANNELS,
522
- out_channels=UNET_PARAMS_OUT_CHANNELS,
523
- down_block_types=tuple(down_block_types),
524
- up_block_types=tuple(up_block_types),
525
- block_out_channels=tuple(block_out_channels),
526
- layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS,
527
- cross_attention_dim=UNET_PARAMS_CONTEXT_DIM if not v2 else V2_UNET_PARAMS_CONTEXT_DIM,
528
- attention_head_dim=UNET_PARAMS_NUM_HEADS if not v2 else V2_UNET_PARAMS_ATTENTION_HEAD_DIM,
529
- )
530
-
531
- return config
532
 
533
 
534
  def create_vae_diffusers_config():
535
- """
536
- Creates a config for the diffusers based on the config of the LDM model.
537
- """
538
- # vae_params = original_config.model.params.first_stage_config.params.ddconfig
539
- # _ = original_config.model.params.first_stage_config.params.embed_dim
540
- block_out_channels = [VAE_PARAMS_CH * mult for mult in VAE_PARAMS_CH_MULT]
541
- down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
542
- up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
543
-
544
- config = dict(
545
- sample_size=VAE_PARAMS_RESOLUTION,
546
- in_channels=VAE_PARAMS_IN_CHANNELS,
547
- out_channels=VAE_PARAMS_OUT_CH,
548
- down_block_types=tuple(down_block_types),
549
- up_block_types=tuple(up_block_types),
550
- block_out_channels=tuple(block_out_channels),
551
- latent_channels=VAE_PARAMS_Z_CHANNELS,
552
- layers_per_block=VAE_PARAMS_NUM_RES_BLOCKS,
553
- )
554
- return config
555
 
556
 
557
  def convert_ldm_clip_checkpoint_v1(checkpoint):
558
- keys = list(checkpoint.keys())
559
- text_model_dict = {}
560
- for key in keys:
561
- if key.startswith("cond_stage_model.transformer"):
562
- text_model_dict[key[len("cond_stage_model.transformer."):]] = checkpoint[key]
563
- return text_model_dict
564
 
565
 
566
  def convert_ldm_clip_checkpoint_v2(checkpoint, max_length):
567
- # 嫌になるくらい違うぞ!
568
- def convert_key(key):
569
- if not key.startswith("cond_stage_model"):
570
- return None
571
-
572
- # common conversion
573
- key = key.replace("cond_stage_model.model.transformer.", "text_model.encoder.")
574
- key = key.replace("cond_stage_model.model.", "text_model.")
575
-
576
- if "resblocks" in key:
577
- # resblocks conversion
578
- key = key.replace(".resblocks.", ".layers.")
579
- if ".ln_" in key:
580
- key = key.replace(".ln_", ".layer_norm")
581
- elif ".mlp." in key:
582
- key = key.replace(".c_fc.", ".fc1.")
583
- key = key.replace(".c_proj.", ".fc2.")
584
- elif '.attn.out_proj' in key:
585
- key = key.replace(".attn.out_proj.", ".self_attn.out_proj.")
586
- elif '.attn.in_proj' in key:
587
- key = None # 特殊なので後で処理する
588
- else:
589
- raise ValueError(f"unexpected key in SD: {key}")
590
- elif '.positional_embedding' in key:
591
- key = key.replace(".positional_embedding", ".embeddings.position_embedding.weight")
592
- elif '.text_projection' in key:
593
- key = None # 使われない???
594
- elif '.logit_scale' in key:
595
- key = None # 使われない???
596
- elif '.token_embedding' in key:
597
- key = key.replace(".token_embedding.weight", ".embeddings.token_embedding.weight")
598
- elif '.ln_final' in key:
599
- key = key.replace(".ln_final", ".final_layer_norm")
600
- return key
601
-
602
- keys = list(checkpoint.keys())
603
- new_sd = {}
604
- for key in keys:
605
- # remove resblocks 23
606
- if '.resblocks.23.' in key:
607
- continue
608
- new_key = convert_key(key)
609
- if new_key is None:
610
- continue
611
- new_sd[new_key] = checkpoint[key]
612
-
613
- # attnの変換
614
- for key in keys:
615
- if '.resblocks.23.' in key:
616
- continue
617
- if '.resblocks' in key and '.attn.in_proj_' in key:
618
- # 三つに分割
619
- values = torch.chunk(checkpoint[key], 3)
620
-
621
- key_suffix = ".weight" if "weight" in key else ".bias"
622
- key_pfx = key.replace("cond_stage_model.model.transformer.resblocks.", "text_model.encoder.layers.")
623
- key_pfx = key_pfx.replace("_weight", "")
624
- key_pfx = key_pfx.replace("_bias", "")
625
- key_pfx = key_pfx.replace(".attn.in_proj", ".self_attn.")
626
- new_sd[key_pfx + "q_proj" + key_suffix] = values[0]
627
- new_sd[key_pfx + "k_proj" + key_suffix] = values[1]
628
- new_sd[key_pfx + "v_proj" + key_suffix] = values[2]
629
-
630
- # rename or add position_ids
631
- ANOTHER_POSITION_IDS_KEY = "text_model.encoder.text_model.embeddings.position_ids"
632
- if ANOTHER_POSITION_IDS_KEY in new_sd:
633
- # waifu diffusion v1.4
634
- position_ids = new_sd[ANOTHER_POSITION_IDS_KEY]
635
- del new_sd[ANOTHER_POSITION_IDS_KEY]
636
- else:
637
- position_ids = torch.Tensor([list(range(max_length))]).to(torch.int64)
638
-
639
- new_sd["text_model.embeddings.position_ids"] = position_ids
640
- return new_sd
 
641
 
642
  # endregion
643
 
@@ -645,540 +620,546 @@ def convert_ldm_clip_checkpoint_v2(checkpoint, max_length):
645
  # region Diffusers->StableDiffusion の変換コード
646
  # convert_diffusers_to_original_stable_diffusion をコピーして修正している(ASL 2.0)
647
 
 
648
  def conv_transformer_to_linear(checkpoint):
649
- keys = list(checkpoint.keys())
650
- tf_keys = ["proj_in.weight", "proj_out.weight"]
651
- for key in keys:
652
- if ".".join(key.split(".")[-2:]) in tf_keys:
653
- if checkpoint[key].ndim > 2:
654
- checkpoint[key] = checkpoint[key][:, :, 0, 0]
655
 
656
 
657
  def convert_unet_state_dict_to_sd(v2, unet_state_dict):
658
- unet_conversion_map = [
659
- # (stable-diffusion, HF Diffusers)
660
- ("time_embed.0.weight", "time_embedding.linear_1.weight"),
661
- ("time_embed.0.bias", "time_embedding.linear_1.bias"),
662
- ("time_embed.2.weight", "time_embedding.linear_2.weight"),
663
- ("time_embed.2.bias", "time_embedding.linear_2.bias"),
664
- ("input_blocks.0.0.weight", "conv_in.weight"),
665
- ("input_blocks.0.0.bias", "conv_in.bias"),
666
- ("out.0.weight", "conv_norm_out.weight"),
667
- ("out.0.bias", "conv_norm_out.bias"),
668
- ("out.2.weight", "conv_out.weight"),
669
- ("out.2.bias", "conv_out.bias"),
670
- ]
671
-
672
- unet_conversion_map_resnet = [
673
- # (stable-diffusion, HF Diffusers)
674
- ("in_layers.0", "norm1"),
675
- ("in_layers.2", "conv1"),
676
- ("out_layers.0", "norm2"),
677
- ("out_layers.3", "conv2"),
678
- ("emb_layers.1", "time_emb_proj"),
679
- ("skip_connection", "conv_shortcut"),
680
- ]
681
-
682
- unet_conversion_map_layer = []
683
- for i in range(4):
684
- # loop over downblocks/upblocks
685
 
686
  for j in range(2):
687
- # loop over resnets/attentions for downblocks
688
- hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
689
- sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
690
- unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
691
-
692
- if i < 3:
693
- # no attention layers in down_blocks.3
694
- hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
695
- sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
696
- unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
697
-
698
- for j in range(3):
699
- # loop over resnets/attentions for upblocks
700
- hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
701
- sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
702
- unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
703
-
704
- if i > 0:
705
- # no attention layers in up_blocks.0
706
- hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
707
- sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
708
- unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
709
-
710
- if i < 3:
711
- # no downsample in down_blocks.3
712
- hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
713
- sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
714
- unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
715
-
716
- # no upsample in up_blocks.3
717
- hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
718
- sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
719
- unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
720
-
721
- hf_mid_atn_prefix = "mid_block.attentions.0."
722
- sd_mid_atn_prefix = "middle_block.1."
723
- unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
724
-
725
- for j in range(2):
726
- hf_mid_res_prefix = f"mid_block.resnets.{j}."
727
- sd_mid_res_prefix = f"middle_block.{2*j}."
728
- unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
729
-
730
- # buyer beware: this is a *brittle* function,
731
- # and correct output requires that all of these pieces interact in
732
- # the exact order in which I have arranged them.
733
- mapping = {k: k for k in unet_state_dict.keys()}
734
- for sd_name, hf_name in unet_conversion_map:
735
- mapping[hf_name] = sd_name
736
- for k, v in mapping.items():
737
- if "resnets" in k:
738
- for sd_part, hf_part in unet_conversion_map_resnet:
739
- v = v.replace(hf_part, sd_part)
740
- mapping[k] = v
741
- for k, v in mapping.items():
742
- for sd_part, hf_part in unet_conversion_map_layer:
743
- v = v.replace(hf_part, sd_part)
744
- mapping[k] = v
745
- new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
746
-
747
- if v2:
748
- conv_transformer_to_linear(new_state_dict)
749
-
750
- return new_state_dict
751
 
752
 
753
  # ================#
754
  # VAE Conversion #
755
  # ================#
756
 
 
757
  def reshape_weight_for_sd(w):
758
  # convert HF linear weights to SD conv2d weights
759
- return w.reshape(*w.shape, 1, 1)
760
 
761
 
762
  def convert_vae_state_dict(vae_state_dict):
763
- vae_conversion_map = [
764
- # (stable-diffusion, HF Diffusers)
765
- ("nin_shortcut", "conv_shortcut"),
766
- ("norm_out", "conv_norm_out"),
767
- ("mid.attn_1.", "mid_block.attentions.0."),
768
- ]
769
-
770
- for i in range(4):
771
- # down_blocks have two resnets
772
- for j in range(2):
773
- hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
774
- sd_down_prefix = f"encoder.down.{i}.block.{j}."
775
- vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
776
-
777
- if i < 3:
778
- hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
779
- sd_downsample_prefix = f"down.{i}.downsample."
780
- vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
781
-
782
- hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
783
- sd_upsample_prefix = f"up.{3-i}.upsample."
784
- vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
785
-
786
- # up_blocks have three resnets
787
- # also, up blocks in hf are numbered in reverse from sd
788
- for j in range(3):
789
- hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
790
- sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
791
- vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
792
-
793
- # this part accounts for mid blocks in both the encoder and the decoder
794
- for i in range(2):
795
- hf_mid_res_prefix = f"mid_block.resnets.{i}."
796
- sd_mid_res_prefix = f"mid.block_{i+1}."
797
- vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
798
-
799
- vae_conversion_map_attn = [
800
- # (stable-diffusion, HF Diffusers)
801
- ("norm.", "group_norm."),
802
- ("q.", "query."),
803
- ("k.", "key."),
804
- ("v.", "value."),
805
- ("proj_out.", "proj_attn."),
806
- ]
807
-
808
- mapping = {k: k for k in vae_state_dict.keys()}
809
- for k, v in mapping.items():
810
- for sd_part, hf_part in vae_conversion_map:
811
- v = v.replace(hf_part, sd_part)
812
- mapping[k] = v
813
- for k, v in mapping.items():
814
- if "attentions" in k:
815
- for sd_part, hf_part in vae_conversion_map_attn:
816
- v = v.replace(hf_part, sd_part)
817
- mapping[k] = v
818
- new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
819
- weights_to_convert = ["q", "k", "v", "proj_out"]
820
- for k, v in new_state_dict.items():
821
- for weight_name in weights_to_convert:
822
- if f"mid.attn_1.{weight_name}.weight" in k:
823
- # print(f"Reshaping {k} for SD format")
824
- new_state_dict[k] = reshape_weight_for_sd(v)
825
-
826
- return new_state_dict
827
 
828
 
829
  # endregion
830
 
831
  # region 自作のモデル読み書きなど
832
 
 
833
  def is_safetensors(path):
834
- return os.path.splitext(path)[1].lower() == '.safetensors'
835
-
836
-
837
- def load_checkpoint_with_text_encoder_conversion(ckpt_path):
838
- # text encoderの格納形式が違うモデルに対応する ('text_model'がない)
839
- TEXT_ENCODER_KEY_REPLACEMENTS = [
840
- ('cond_stage_model.transformer.embeddings.', 'cond_stage_model.transformer.text_model.embeddings.'),
841
- ('cond_stage_model.transformer.encoder.', 'cond_stage_model.transformer.text_model.encoder.'),
842
- ('cond_stage_model.transformer.final_layer_norm.', 'cond_stage_model.transformer.text_model.final_layer_norm.')
843
- ]
844
-
845
- if is_safetensors(ckpt_path):
846
- checkpoint = None
847
- state_dict = load_file(ckpt_path, "cpu")
848
- else:
849
- checkpoint = torch.load(ckpt_path, map_location="cpu")
850
- if "state_dict" in checkpoint:
851
- state_dict = checkpoint["state_dict"]
852
  else:
853
- state_dict = checkpoint
854
- checkpoint = None
 
 
 
 
855
 
856
- key_reps = []
857
- for rep_from, rep_to in TEXT_ENCODER_KEY_REPLACEMENTS:
858
- for key in state_dict.keys():
859
- if key.startswith(rep_from):
860
- new_key = rep_to + key[len(rep_from):]
861
- key_reps.append((key, new_key))
862
 
863
- for key, new_key in key_reps:
864
- state_dict[new_key] = state_dict[key]
865
- del state_dict[key]
866
 
867
- return checkpoint, state_dict
868
 
869
 
870
# TODO dtype handling looks unreliable and needs checking; building the text_encoder in the requested dtype is unverified
871
- def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, dtype=None):
872
- _, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path)
873
- if dtype is not None:
874
- for k, v in state_dict.items():
875
- if type(v) is torch.Tensor:
876
- state_dict[k] = v.to(dtype)
877
-
878
- # Convert the UNet2DConditionModel model.
879
- unet_config = create_unet_diffusers_config(v2)
880
- converted_unet_checkpoint = convert_ldm_unet_checkpoint(v2, state_dict, unet_config)
881
-
882
- unet = UNet2DConditionModel(**unet_config)
883
- info = unet.load_state_dict(converted_unet_checkpoint)
884
- print("loading u-net:", info)
885
-
886
- # Convert the VAE model.
887
- vae_config = create_vae_diffusers_config()
888
- converted_vae_checkpoint = convert_ldm_vae_checkpoint(state_dict, vae_config)
889
-
890
- vae = AutoencoderKL(**vae_config)
891
- info = vae.load_state_dict(converted_vae_checkpoint)
892
- print("loading vae:", info)
893
-
894
- # convert text_model
895
- if v2:
896
- converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v2(state_dict, 77)
897
- cfg = CLIPTextConfig(
898
- vocab_size=49408,
899
- hidden_size=1024,
900
- intermediate_size=4096,
901
- num_hidden_layers=23,
902
- num_attention_heads=16,
903
- max_position_embeddings=77,
904
- hidden_act="gelu",
905
- layer_norm_eps=1e-05,
906
- dropout=0.0,
907
- attention_dropout=0.0,
908
- initializer_range=0.02,
909
- initializer_factor=1.0,
910
- pad_token_id=1,
911
- bos_token_id=0,
912
- eos_token_id=2,
913
- model_type="clip_text_model",
914
- projection_dim=512,
915
- torch_dtype="float32",
916
- transformers_version="4.25.0.dev0",
917
- )
918
- text_model = CLIPTextModel._from_config(cfg)
919
- info = text_model.load_state_dict(converted_text_encoder_checkpoint)
920
- else:
921
- converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v1(state_dict)
922
- text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
923
- info = text_model.load_state_dict(converted_text_encoder_checkpoint)
924
- print("loading text encoder:", info)
925
 
926
- return text_model, vae, unet
 
 
927
 
 
 
 
928
 
929
- def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=False):
930
- def convert_key(key):
931
- # remove position_ids
932
- if ".position_ids" in key:
933
- return None
934
-
935
- # common
936
- key = key.replace("text_model.encoder.", "transformer.")
937
- key = key.replace("text_model.", "")
938
- if "layers" in key:
939
- # resblocks conversion
940
- key = key.replace(".layers.", ".resblocks.")
941
- if ".layer_norm" in key:
942
- key = key.replace(".layer_norm", ".ln_")
943
- elif ".mlp." in key:
944
- key = key.replace(".fc1.", ".c_fc.")
945
- key = key.replace(".fc2.", ".c_proj.")
946
- elif '.self_attn.out_proj' in key:
947
- key = key.replace(".self_attn.out_proj.", ".attn.out_proj.")
948
- elif '.self_attn.' in key:
949
- key = None # special case, handled separately below
950
- else:
951
- raise ValueError(f"unexpected key in DiffUsers model: {key}")
952
- elif '.position_embedding' in key:
953
- key = key.replace("embeddings.position_embedding.weight", "positional_embedding")
954
- elif '.token_embedding' in key:
955
- key = key.replace("embeddings.token_embedding.weight", "token_embedding.weight")
956
- elif 'final_layer_norm' in key:
957
- key = key.replace("final_layer_norm", "ln_final")
958
- return key
959
-
960
- keys = list(checkpoint.keys())
961
- new_sd = {}
962
- for key in keys:
963
- new_key = convert_key(key)
964
- if new_key is None:
965
- continue
966
- new_sd[new_key] = checkpoint[key]
967
-
968
- # convert the attention weights
969
- for key in keys:
970
- if 'layers' in key and 'q_proj' in key:
971
- # concatenate the three projections
972
- key_q = key
973
- key_k = key.replace("q_proj", "k_proj")
974
- key_v = key.replace("q_proj", "v_proj")
975
-
976
- value_q = checkpoint[key_q]
977
- value_k = checkpoint[key_k]
978
- value_v = checkpoint[key_v]
979
- value = torch.cat([value_q, value_k, value_v])
980
-
981
- new_key = key.replace("text_model.encoder.layers.", "transformer.resblocks.")
982
- new_key = new_key.replace(".self_attn.q_proj.", ".attn.in_proj_")
983
- new_sd[new_key] = value
984
-
985
- # whether to fabricate the last layer and other missing weights
986
- if make_dummy_weights:
987
- print("make dummy weights for resblock.23, text_projection and logit scale.")
988
- keys = list(new_sd.keys())
989
- for key in keys:
990
- if key.startswith("transformer.resblocks.22."):
991
- new_sd[key.replace(".22.", ".23.")] = new_sd[key].clone() # without clone(), saving with safetensors fails
992
 
993
- # create weights that the Diffusers model does not include
994
- new_sd['text_projection'] = torch.ones((1024, 1024), dtype=new_sd[keys[0]].dtype, device=new_sd[keys[0]].device)
995
- new_sd['logit_scale'] = torch.tensor(1)
996
 
997
- return new_sd
 
 
 
 
 
998
 
 
 
 
999
 
1000
- def save_stable_diffusion_checkpoint(v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, save_dtype=None, vae=None):
1001
- if ckpt_path is not None:
1002
- # reuse its epoch/step; also reload the checkpoint (including the VAE), e.g. when the VAE is not in memory
1003
- checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path)
1004
- if checkpoint is None: # safetensors file or a state_dict-only ckpt
1005
- checkpoint = {}
1006
- strict = False
1007
- else:
1008
- strict = True
1009
- if "state_dict" in state_dict:
1010
- del state_dict["state_dict"]
1011
- else:
1012
- # build a new one from scratch
1013
- assert vae is not None, "VAE is required to save a checkpoint without a given checkpoint"
1014
- checkpoint = {}
1015
- state_dict = {}
1016
- strict = False
1017
-
1018
- def update_sd(prefix, sd):
1019
- for k, v in sd.items():
1020
- key = prefix + k
1021
- assert not strict or key in state_dict, f"Illegal key in save SD: {key}"
1022
- if save_dtype is not None:
1023
- v = v.detach().clone().to("cpu").to(save_dtype)
1024
- state_dict[key] = v
1025
-
1026
- # Convert the UNet model
1027
- unet_state_dict = convert_unet_state_dict_to_sd(v2, unet.state_dict())
1028
- update_sd("model.diffusion_model.", unet_state_dict)
1029
-
1030
- # Convert the text encoder model
1031
- if v2:
1032
- make_dummy = ckpt_path is None # without a reference checkpoint, insert dummy weights (e.g. duplicate the last layer from the one before it)
1033
- text_enc_dict = convert_text_encoder_state_dict_to_sd_v2(text_encoder.state_dict(), make_dummy)
1034
- update_sd("cond_stage_model.model.", text_enc_dict)
1035
- else:
1036
- text_enc_dict = text_encoder.state_dict()
1037
- update_sd("cond_stage_model.transformer.", text_enc_dict)
1038
-
1039
- # Convert the VAE
1040
- if vae is not None:
1041
- vae_dict = convert_vae_state_dict(vae.state_dict())
1042
- update_sd("first_stage_model.", vae_dict)
1043
-
1044
- # Put together new checkpoint
1045
- key_count = len(state_dict.keys())
1046
- new_ckpt = {'state_dict': state_dict}
1047
-
1048
- if 'epoch' in checkpoint:
1049
- epochs += checkpoint['epoch']
1050
- if 'global_step' in checkpoint:
1051
- steps += checkpoint['global_step']
1052
-
1053
- new_ckpt['epoch'] = epochs
1054
- new_ckpt['global_step'] = steps
1055
-
1056
- if is_safetensors(output_file):
1057
- # TODO should non-Tensor values be removed from the dict?
1058
- save_file(state_dict, output_file)
1059
- else:
1060
- torch.save(new_ckpt, output_file)
1061
-
1062
- return key_count
1063
 
 
1064
 
1065
- def save_diffusers_checkpoint(v2, output_dir, text_encoder, unet, pretrained_model_name_or_path, vae=None, use_safetensors=False):
1066
- if pretrained_model_name_or_path is None:
1067
- # load default settings for v1/v2
1068
- if v2:
1069
- pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V2
1070
- else:
1071
- pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V1
1072
-
1073
- scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
1074
- tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
1075
- if vae is None:
1076
- vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
1077
-
1078
- pipeline = StableDiffusionPipeline(
1079
- unet=unet,
1080
- text_encoder=text_encoder,
1081
- vae=vae,
1082
- scheduler=scheduler,
1083
- tokenizer=tokenizer,
1084
- safety_checker=None,
1085
- feature_extractor=None,
1086
- requires_safety_checker=None,
1087
- )
1088
- pipeline.save_pretrained(output_dir, safe_serialization=use_safetensors)
1089
 
 
1090
 
1091
- VAE_PREFIX = "first_stage_model."
 
 
 
 
 
 
1092
 
 
 
 
 
1093
 
1094
- def load_vae(vae_id, dtype):
1095
- print(f"load VAE: {vae_id}")
1096
- if os.path.isdir(vae_id) or not os.path.isfile(vae_id):
1097
- # Diffusers local/remote
1098
- try:
1099
- vae = AutoencoderKL.from_pretrained(vae_id, subfolder=None, torch_dtype=dtype)
1100
- except EnvironmentError as e:
1101
- print(f"exception occurs in loading vae: {e}")
1102
- print("retry with subfolder='vae'")
1103
- vae = AutoencoderKL.from_pretrained(vae_id, subfolder="vae", torch_dtype=dtype)
1104
- return vae
1105
 
1106
- # local
1107
- vae_config = create_vae_diffusers_config()
1108
-
1109
- if vae_id.endswith(".bin"):
1110
- # SD 1.5 VAE on Huggingface
1111
- converted_vae_checkpoint = torch.load(vae_id, map_location="cpu")
1112
- else:
1113
- # StableDiffusion
1114
- vae_model = (load_file(vae_id, "cpu") if is_safetensors(vae_id)
1115
- else torch.load(vae_id, map_location="cpu"))
1116
- vae_sd = vae_model['state_dict'] if 'state_dict' in vae_model else vae_model
1117
-
1118
- # vae only or full model
1119
- full_model = False
1120
- for vae_key in vae_sd:
1121
- if vae_key.startswith(VAE_PREFIX):
1122
- full_model = True
1123
- break
1124
- if not full_model:
1125
- sd = {}
1126
- for key, value in vae_sd.items():
1127
- sd[VAE_PREFIX + key] = value
1128
- vae_sd = sd
1129
- del sd
1130
 
1131
- # Convert the VAE model.
1132
- converted_vae_checkpoint = convert_ldm_vae_checkpoint(vae_sd, vae_config)
 
1133
 
1134
- vae = AutoencoderKL(**vae_config)
1135
- vae.load_state_dict(converted_vae_checkpoint)
1136
- return vae
1137
 
1138
- # endregion
1139
 
 
1140
 
1141
- def make_bucket_resolutions(max_reso, min_size=256, max_size=1024, divisible=64):
1142
- max_width, max_height = max_reso
1143
- max_area = (max_width // divisible) * (max_height // divisible)
 
1144
 
1145
- resos = set()
 
 
1146
 
1147
- size = int(math.sqrt(max_area)) * divisible
1148
- resos.add((size, size))
 
 
1149
 
1150
- size = min_size
1151
- while size <= max_size:
1152
- width = size
1153
- height = min(max_size, (max_area // (width // divisible)) * divisible)
1154
- resos.add((width, height))
1155
- resos.add((height, width))
1156
 
1157
- # # make additional resos
1158
- # if width >= height and width - divisible >= min_size:
1159
- # resos.add((width - divisible, height))
1160
- # resos.add((height, width - divisible))
1161
- # if height >= width and height - divisible >= min_size:
1162
- # resos.add((width, height - divisible))
1163
- # resos.add((height - divisible, width))
1164
 
1165
- size += divisible
 
 
 
1166
 
1167
- resos = list(resos)
1168
- resos.sort()
1169
 
1170
- aspect_ratios = [w / h for w, h in resos]
1171
- return resos, aspect_ratios
1172
 
1173
 
1174
- if __name__ == '__main__':
1175
- resos, aspect_ratios = make_bucket_resolutions((512, 768))
1176
- print(len(resos))
1177
- print(resos)
1178
- print(aspect_ratios)
 
 
 
1179
 
1180
- ars = set()
1181
- for ar in aspect_ratios:
1182
- if ar in ars:
1183
- print("error! duplicate ar:", ar)
1184
- ars.add(ar)
 
 
 
 
1
  # v1: split from train_db_fixed.py.
2
  # v2: support safetensors
3
 
4
  import math
5
  import os
6
  import torch
7
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging
8
  from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel
9
  from safetensors.torch import load_file, save_file
10
 
 
16
  UNET_PARAMS_MODEL_CHANNELS = 320
17
  UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4]
18
  UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1]
19
+ UNET_PARAMS_IMAGE_SIZE = 64 # fixed from old invalid value `32`
20
  UNET_PARAMS_IN_CHANNELS = 4
21
  UNET_PARAMS_OUT_CHANNELS = 4
22
  UNET_PARAMS_NUM_RES_BLOCKS = 2
 
45
 
46
 
47
  def shave_segments(path, n_shave_prefix_segments=1):
48
+ """
49
+ Removes segments. Positive values shave the first segments, negative shave the last segments.
50
+ """
51
+ if n_shave_prefix_segments >= 0:
52
+ return ".".join(path.split(".")[n_shave_prefix_segments:])
53
+ else:
54
+ return ".".join(path.split(".")[:n_shave_prefix_segments])
55
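A quick illustration of shave_segments (a minimal sketch that assumes this module's functions are importable; the key path below is invented for the example):
# drop the first two dot-separated segments of a key path
p = "model.diffusion_model.input_blocks.1.0.in_layers.0.weight"
assert shave_segments(p, 2) == "input_blocks.1.0.in_layers.0.weight"
# a negative count drops segments from the end instead
assert shave_segments(p, -1) == "model.diffusion_model.input_blocks.1.0.in_layers.0"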
 
56
 
57
  def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
58
+ """
59
+ Updates paths inside resnets to the new naming scheme (local renaming)
60
+ """
61
+ mapping = []
62
+ for old_item in old_list:
63
+ new_item = old_item.replace("in_layers.0", "norm1")
64
+ new_item = new_item.replace("in_layers.2", "conv1")
65
 
66
+ new_item = new_item.replace("out_layers.0", "norm2")
67
+ new_item = new_item.replace("out_layers.3", "conv2")
68
 
69
+ new_item = new_item.replace("emb_layers.1", "time_emb_proj")
70
+ new_item = new_item.replace("skip_connection", "conv_shortcut")
71
 
72
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
73
 
74
+ mapping.append({"old": old_item, "new": new_item})
75
 
76
+ return mapping
77
 
78
 
79
  def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
80
+ """
81
+ Updates paths inside resnets to the new naming scheme (local renaming)
82
+ """
83
+ mapping = []
84
+ for old_item in old_list:
85
+ new_item = old_item
86
 
87
+ new_item = new_item.replace("nin_shortcut", "conv_shortcut")
88
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
89
 
90
+ mapping.append({"old": old_item, "new": new_item})
91
 
92
+ return mapping
93
 
94
 
95
  def renew_attention_paths(old_list, n_shave_prefix_segments=0):
96
+ """
97
+ Updates paths inside attentions to the new naming scheme (local renaming)
98
+ """
99
+ mapping = []
100
+ for old_item in old_list:
101
+ new_item = old_item
102
 
103
+ # new_item = new_item.replace('norm.weight', 'group_norm.weight')
104
+ # new_item = new_item.replace('norm.bias', 'group_norm.bias')
105
 
106
+ # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
107
+ # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
108
 
109
+ # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
110
 
111
+ mapping.append({"old": old_item, "new": new_item})
112
 
113
+ return mapping
114
 
115
 
116
  def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
117
+ """
118
+ Updates paths inside attentions to the new naming scheme (local renaming)
119
+ """
120
+ mapping = []
121
+ for old_item in old_list:
122
+ new_item = old_item
123
 
124
+ new_item = new_item.replace("norm.weight", "group_norm.weight")
125
+ new_item = new_item.replace("norm.bias", "group_norm.bias")
126
 
127
+ new_item = new_item.replace("q.weight", "query.weight")
128
+ new_item = new_item.replace("q.bias", "query.bias")
129
 
130
+ new_item = new_item.replace("k.weight", "key.weight")
131
+ new_item = new_item.replace("k.bias", "key.bias")
132
 
133
+ new_item = new_item.replace("v.weight", "value.weight")
134
+ new_item = new_item.replace("v.bias", "value.bias")
135
 
136
+ new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
137
+ new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
138
 
139
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
140
 
141
+ mapping.append({"old": old_item, "new": new_item})
142
 
143
+ return mapping
144
 
145
 
146
  def assign_to_checkpoint(
147
  paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
148
  ):
149
+ """
150
+ This does the final conversion step: take locally converted weights and apply a global renaming
151
+ to them. It splits attention layers, and takes into account additional replacements
152
+ that may arise.
153
 
154
+ Assigns the weights to the new checkpoint.
155
+ """
156
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
157
 
158
+ # Splits the attention layers into three variables.
159
+ if attention_paths_to_split is not None:
160
+ for path, path_map in attention_paths_to_split.items():
161
+ old_tensor = old_checkpoint[path]
162
+ channels = old_tensor.shape[0] // 3
163
 
164
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
165
 
166
+ num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
167
 
168
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
169
+ query, key, value = old_tensor.split(channels // num_heads, dim=1)
170
 
171
+ checkpoint[path_map["query"]] = query.reshape(target_shape)
172
+ checkpoint[path_map["key"]] = key.reshape(target_shape)
173
+ checkpoint[path_map["value"]] = value.reshape(target_shape)
174
 
175
+ for path in paths:
176
+ new_path = path["new"]
177
 
178
+ # These have already been assigned
179
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
180
+ continue
181
 
182
+ # Global renaming happens here
183
+ new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
184
+ new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
185
+ new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
186
 
187
+ if additional_replacements is not None:
188
+ for replacement in additional_replacements:
189
+ new_path = new_path.replace(replacement["old"], replacement["new"])
190
 
191
+ # proj_attn.weight has to be converted from conv 1D to linear
192
+ if "proj_attn.weight" in new_path:
193
+ checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
194
+ else:
195
+ checkpoint[new_path] = old_checkpoint[path["old"]]
196
 
197
 
198
  def conv_attn_to_linear(checkpoint):
199
+ keys = list(checkpoint.keys())
200
+ attn_keys = ["query.weight", "key.weight", "value.weight"]
201
+ for key in keys:
202
+ if ".".join(key.split(".")[-2:]) in attn_keys:
203
+ if checkpoint[key].ndim > 2:
204
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
205
+ elif "proj_attn.weight" in key:
206
+ if checkpoint[key].ndim > 2:
207
+ checkpoint[key] = checkpoint[key][:, :, 0]
208
 
209
 
210
  def linear_transformer_to_conv(checkpoint):
211
+ keys = list(checkpoint.keys())
212
+ tf_keys = ["proj_in.weight", "proj_out.weight"]
213
+ for key in keys:
214
+ if ".".join(key.split(".")[-2:]) in tf_keys:
215
+ if checkpoint[key].ndim == 2:
216
+ checkpoint[key] = checkpoint[key].unsqueeze(2).unsqueeze(2)
217
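A shape-only check of the two helpers above (tensor sizes are illustrative, not taken from a real checkpoint):
import torch
sd = {"proj_in.weight": torch.randn(320, 320)}
linear_transformer_to_conv(sd)                      # 2-D linear weight gains two trailing 1x1 dims
assert sd["proj_in.weight"].shape == (320, 320, 1, 1)
attn = {"query.weight": torch.randn(512, 512, 1, 1)}
conv_attn_to_linear(attn)                           # the attention helper goes the other way
assert attn["query.weight"].shape == (512, 512)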
 
218
 
219
  def convert_ldm_unet_checkpoint(v2, checkpoint, config):
220
+ """
221
+ Takes a state dict and a config, and returns a converted checkpoint.
222
+ """
223
+
224
+ # extract state_dict for UNet
225
+ unet_state_dict = {}
226
+ unet_key = "model.diffusion_model."
227
+ keys = list(checkpoint.keys())
228
+ for key in keys:
229
+ if key.startswith(unet_key):
230
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
231
+
232
+ new_checkpoint = {}
233
+
234
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
235
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
236
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
237
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
238
+
239
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
240
+ new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
241
+
242
+ new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
243
+ new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
244
+ new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
245
+ new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
246
+
247
+ # Retrieves the keys for the input blocks only
248
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
249
+ input_blocks = {
250
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] for layer_id in range(num_input_blocks)
251
+ }
252
+
253
+ # Retrieves the keys for the middle blocks only
254
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
255
+ middle_blocks = {
256
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key] for layer_id in range(num_middle_blocks)
257
+ }
258
+
259
+ # Retrieves the keys for the output blocks only
260
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
261
+ output_blocks = {
262
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key] for layer_id in range(num_output_blocks)
263
+ }
264
+
265
+ for i in range(1, num_input_blocks):
266
+ block_id = (i - 1) // (config["layers_per_block"] + 1)
267
+ layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
268
+
269
+ resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key]
270
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
271
+
272
+ if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
273
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
274
+ f"input_blocks.{i}.0.op.weight"
275
+ )
276
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(f"input_blocks.{i}.0.op.bias")
277
+
278
+ paths = renew_resnet_paths(resnets)
279
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
280
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
281
+
282
+ if len(attentions):
283
+ paths = renew_attention_paths(attentions)
284
+ meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
285
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
286
+
287
+ resnet_0 = middle_blocks[0]
288
+ attentions = middle_blocks[1]
289
+ resnet_1 = middle_blocks[2]
290
+
291
+ resnet_0_paths = renew_resnet_paths(resnet_0)
292
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
293
+
294
+ resnet_1_paths = renew_resnet_paths(resnet_1)
295
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
296
+
297
+ attentions_paths = renew_attention_paths(attentions)
298
+ meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
299
+ assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
300
+
301
+ for i in range(num_output_blocks):
302
+ block_id = i // (config["layers_per_block"] + 1)
303
+ layer_in_block_id = i % (config["layers_per_block"] + 1)
304
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
305
+ output_block_list = {}
306
+
307
+ for layer in output_block_layers:
308
+ layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
309
+ if layer_id in output_block_list:
310
+ output_block_list[layer_id].append(layer_name)
311
+ else:
312
+ output_block_list[layer_id] = [layer_name]
313
+
314
+ if len(output_block_list) > 1:
315
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
316
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
317
+
318
+ resnet_0_paths = renew_resnet_paths(resnets)
319
+ paths = renew_resnet_paths(resnets)
320
+
321
+ meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
322
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
323
+
324
+ # original:
325
+ # if ["conv.weight", "conv.bias"] in output_block_list.values():
326
+ # index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
327
+
328
+ # avoid depending on the ordering of bias and weight; there is probably a nicer way
329
+ for l in output_block_list.values():
330
+ l.sort()
331
+
332
+ if ["conv.bias", "conv.weight"] in output_block_list.values():
333
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
334
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
335
+ f"output_blocks.{i}.{index}.conv.bias"
336
+ ]
337
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
338
+ f"output_blocks.{i}.{index}.conv.weight"
339
+ ]
340
+
341
+ # Clear attentions as they have been attributed above.
342
+ if len(attentions) == 2:
343
+ attentions = []
344
+
345
+ if len(attentions):
346
+ paths = renew_attention_paths(attentions)
347
+ meta_path = {
348
+ "old": f"output_blocks.{i}.1",
349
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
350
+ }
351
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
352
+ else:
353
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
354
+ for path in resnet_0_paths:
355
+ old_path = ".".join(["output_blocks", str(i), path["old"]])
356
+ new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
357
+
358
+ new_checkpoint[new_path] = unet_state_dict[old_path]
359
+
360
+ # in SD v2 the 1x1 conv2d layers became linear layers, so convert linear -> conv here
361
+ if v2:
362
+ linear_transformer_to_conv(new_checkpoint)
 
 
 
363
 
364
+ return new_checkpoint
365
 
366
 
367
  def convert_ldm_vae_checkpoint(checkpoint, config):
368
+ # extract state dict for VAE
369
+ vae_state_dict = {}
370
+ vae_key = "first_stage_model."
371
+ keys = list(checkpoint.keys())
372
+ for key in keys:
373
+ if key.startswith(vae_key):
374
+ vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
375
+ # if len(vae_state_dict) == 0:
376
+ # # the checkpoint passed in is a VAE state_dict rather than one loaded from a .ckpt
377
+ # vae_state_dict = checkpoint
378
+
379
+ new_checkpoint = {}
380
+
381
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
382
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
383
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
384
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
385
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
386
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
387
+
388
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
389
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
390
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
391
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
392
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
393
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
394
+
395
+ new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
396
+ new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
397
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
398
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
399
+
400
+ # Retrieves the keys for the encoder down blocks only
401
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
402
+ down_blocks = {layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)}
403
+
404
+ # Retrieves the keys for the decoder up blocks only
405
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
406
+ up_blocks = {layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}
407
+
408
+ for i in range(num_down_blocks):
409
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
410
+
411
+ if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
412
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
413
+ f"encoder.down.{i}.downsample.conv.weight"
414
+ )
415
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
416
+ f"encoder.down.{i}.downsample.conv.bias"
417
+ )
418
+
419
+ paths = renew_vae_resnet_paths(resnets)
420
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
421
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
422
+
423
+ mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
424
+ num_mid_res_blocks = 2
425
+ for i in range(1, num_mid_res_blocks + 1):
426
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
427
+
428
+ paths = renew_vae_resnet_paths(resnets)
429
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
430
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
431
+
432
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
433
+ paths = renew_vae_attention_paths(mid_attentions)
434
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
435
  assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
436
+ conv_attn_to_linear(new_checkpoint)
437
+
438
+ for i in range(num_up_blocks):
439
+ block_id = num_up_blocks - 1 - i
440
+ resnets = [key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key]
441
+
442
+ if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
443
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
444
+ f"decoder.up.{block_id}.upsample.conv.weight"
445
+ ]
446
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
447
+ f"decoder.up.{block_id}.upsample.conv.bias"
448
+ ]
449
+
450
+ paths = renew_vae_resnet_paths(resnets)
451
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
452
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
453
+
454
+ mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
455
+ num_mid_res_blocks = 2
456
+ for i in range(1, num_mid_res_blocks + 1):
457
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
458
+
459
+ paths = renew_vae_resnet_paths(resnets)
460
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
461
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
462
+
463
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
464
+ paths = renew_vae_attention_paths(mid_attentions)
465
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
466
  assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
467
+ conv_attn_to_linear(new_checkpoint)
468
+ return new_checkpoint
 
 
 
469
 
470
 
471
  def create_unet_diffusers_config(v2):
472
+ """
473
+ Creates a config for the diffusers based on the config of the LDM model.
474
+ """
475
+ # unet_params = original_config.model.params.unet_config.params
476
+
477
+ block_out_channels = [UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT]
478
+
479
+ down_block_types = []
480
+ resolution = 1
481
+ for i in range(len(block_out_channels)):
482
+ block_type = "CrossAttnDownBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "DownBlock2D"
483
+ down_block_types.append(block_type)
484
+ if i != len(block_out_channels) - 1:
485
+ resolution *= 2
486
+
487
+ up_block_types = []
488
+ for i in range(len(block_out_channels)):
489
+ block_type = "CrossAttnUpBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "UpBlock2D"
490
+ up_block_types.append(block_type)
491
+ resolution //= 2
492
+
493
+ config = dict(
494
+ sample_size=UNET_PARAMS_IMAGE_SIZE,
495
+ in_channels=UNET_PARAMS_IN_CHANNELS,
496
+ out_channels=UNET_PARAMS_OUT_CHANNELS,
497
+ down_block_types=tuple(down_block_types),
498
+ up_block_types=tuple(up_block_types),
499
+ block_out_channels=tuple(block_out_channels),
500
+ layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS,
501
+ cross_attention_dim=UNET_PARAMS_CONTEXT_DIM if not v2 else V2_UNET_PARAMS_CONTEXT_DIM,
502
+ attention_head_dim=UNET_PARAMS_NUM_HEADS if not v2 else V2_UNET_PARAMS_ATTENTION_HEAD_DIM,
503
+ )
504
+
505
+ return config
506
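A small sanity check of the layout this config produces, derived only from the UNET_PARAMS_* constants defined earlier in this file (cross-attention at resolutions 1, 2 and 4, none at the deepest level):
cfg = create_unet_diffusers_config(v2=False)
assert cfg["block_out_channels"] == (320, 640, 1280, 1280)
assert cfg["down_block_types"] == ("CrossAttnDownBlock2D",) * 3 + ("DownBlock2D",)
assert cfg["up_block_types"] == ("UpBlock2D",) + ("CrossAttnUpBlock2D",) * 3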
 
507
 
508
  def create_vae_diffusers_config():
509
+ """
510
+ Creates a config for the diffusers based on the config of the LDM model.
511
+ """
512
+ # vae_params = original_config.model.params.first_stage_config.params.ddconfig
513
+ # _ = original_config.model.params.first_stage_config.params.embed_dim
514
+ block_out_channels = [VAE_PARAMS_CH * mult for mult in VAE_PARAMS_CH_MULT]
515
+ down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
516
+ up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
517
+
518
+ config = dict(
519
+ sample_size=VAE_PARAMS_RESOLUTION,
520
+ in_channels=VAE_PARAMS_IN_CHANNELS,
521
+ out_channels=VAE_PARAMS_OUT_CH,
522
+ down_block_types=tuple(down_block_types),
523
+ up_block_types=tuple(up_block_types),
524
+ block_out_channels=tuple(block_out_channels),
525
+ latent_channels=VAE_PARAMS_Z_CHANNELS,
526
+ layers_per_block=VAE_PARAMS_NUM_RES_BLOCKS,
527
+ )
528
+ return config
529
 
530
 
531
  def convert_ldm_clip_checkpoint_v1(checkpoint):
532
+ keys = list(checkpoint.keys())
533
+ text_model_dict = {}
534
+ for key in keys:
535
+ if key.startswith("cond_stage_model.transformer"):
536
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
537
+ return text_model_dict
538
 
539
 
540
  def convert_ldm_clip_checkpoint_v2(checkpoint, max_length):
541
+ # the key layouts differ to an annoying degree!
542
+ def convert_key(key):
543
+ if not key.startswith("cond_stage_model"):
544
+ return None
545
+
546
+ # common conversion
547
+ key = key.replace("cond_stage_model.model.transformer.", "text_model.encoder.")
548
+ key = key.replace("cond_stage_model.model.", "text_model.")
549
+
550
+ if "resblocks" in key:
551
+ # resblocks conversion
552
+ key = key.replace(".resblocks.", ".layers.")
553
+ if ".ln_" in key:
554
+ key = key.replace(".ln_", ".layer_norm")
555
+ elif ".mlp." in key:
556
+ key = key.replace(".c_fc.", ".fc1.")
557
+ key = key.replace(".c_proj.", ".fc2.")
558
+ elif ".attn.out_proj" in key:
559
+ key = key.replace(".attn.out_proj.", ".self_attn.out_proj.")
560
+ elif ".attn.in_proj" in key:
561
+ key = None # special case, handled separately below
562
+ else:
563
+ raise ValueError(f"unexpected key in SD: {key}")
564
+ elif ".positional_embedding" in key:
565
+ key = key.replace(".positional_embedding", ".embeddings.position_embedding.weight")
566
+ elif ".text_projection" in key:
567
+ key = None # not used???
568
+ elif ".logit_scale" in key:
569
+ key = None # not used???
570
+ elif ".token_embedding" in key:
571
+ key = key.replace(".token_embedding.weight", ".embeddings.token_embedding.weight")
572
+ elif ".ln_final" in key:
573
+ key = key.replace(".ln_final", ".final_layer_norm")
574
+ return key
575
+
576
+ keys = list(checkpoint.keys())
577
+ new_sd = {}
578
+ for key in keys:
579
+ # remove resblocks 23
580
+ if ".resblocks.23." in key:
581
+ continue
582
+ new_key = convert_key(key)
583
+ if new_key is None:
584
+ continue
585
+ new_sd[new_key] = checkpoint[key]
586
+
587
+ # convert the attention weights
588
+ for key in keys:
589
+ if ".resblocks.23." in key:
590
+ continue
591
+ if ".resblocks" in key and ".attn.in_proj_" in key:
592
+ # split into the three projections
593
+ values = torch.chunk(checkpoint[key], 3)
594
+
595
+ key_suffix = ".weight" if "weight" in key else ".bias"
596
+ key_pfx = key.replace("cond_stage_model.model.transformer.resblocks.", "text_model.encoder.layers.")
597
+ key_pfx = key_pfx.replace("_weight", "")
598
+ key_pfx = key_pfx.replace("_bias", "")
599
+ key_pfx = key_pfx.replace(".attn.in_proj", ".self_attn.")
600
+ new_sd[key_pfx + "q_proj" + key_suffix] = values[0]
601
+ new_sd[key_pfx + "k_proj" + key_suffix] = values[1]
602
+ new_sd[key_pfx + "v_proj" + key_suffix] = values[2]
603
+
604
+ # rename or add position_ids
605
+ ANOTHER_POSITION_IDS_KEY = "text_model.encoder.text_model.embeddings.position_ids"
606
+ if ANOTHER_POSITION_IDS_KEY in new_sd:
607
+ # waifu diffusion v1.4
608
+ position_ids = new_sd[ANOTHER_POSITION_IDS_KEY]
609
+ del new_sd[ANOTHER_POSITION_IDS_KEY]
610
+ else:
611
+ position_ids = torch.Tensor([list(range(max_length))]).to(torch.int64)
612
+
613
+ new_sd["text_model.embeddings.position_ids"] = position_ids
614
+ return new_sd
615
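For reference, the in_proj handling above is purely a shape split: the SD v2 text encoder stores q/k/v as one fused projection, which torch.chunk slices back into three. A minimal sketch with the v2 hidden size of 1024:
import torch
in_proj_weight = torch.randn(3 * 1024, 1024)   # fused q/k/v as stored in the SD checkpoint
q_w, k_w, v_w = torch.chunk(in_proj_weight, 3)
assert q_w.shape == k_w.shape == v_w.shape == (1024, 1024)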
+
616
 
617
  # endregion
618
 
 
620
# region Diffusers->StableDiffusion conversion code
621
# copied from convert_diffusers_to_original_stable_diffusion and modified (ASL 2.0)
622
 
623
+
624
  def conv_transformer_to_linear(checkpoint):
625
+ keys = list(checkpoint.keys())
626
+ tf_keys = ["proj_in.weight", "proj_out.weight"]
627
+ for key in keys:
628
+ if ".".join(key.split(".")[-2:]) in tf_keys:
629
+ if checkpoint[key].ndim > 2:
630
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
631
 
632
 
633
  def convert_unet_state_dict_to_sd(v2, unet_state_dict):
634
+ unet_conversion_map = [
635
+ # (stable-diffusion, HF Diffusers)
636
+ ("time_embed.0.weight", "time_embedding.linear_1.weight"),
637
+ ("time_embed.0.bias", "time_embedding.linear_1.bias"),
638
+ ("time_embed.2.weight", "time_embedding.linear_2.weight"),
639
+ ("time_embed.2.bias", "time_embedding.linear_2.bias"),
640
+ ("input_blocks.0.0.weight", "conv_in.weight"),
641
+ ("input_blocks.0.0.bias", "conv_in.bias"),
642
+ ("out.0.weight", "conv_norm_out.weight"),
643
+ ("out.0.bias", "conv_norm_out.bias"),
644
+ ("out.2.weight", "conv_out.weight"),
645
+ ("out.2.bias", "conv_out.bias"),
646
+ ]
647
+
648
+ unet_conversion_map_resnet = [
649
+ # (stable-diffusion, HF Diffusers)
650
+ ("in_layers.0", "norm1"),
651
+ ("in_layers.2", "conv1"),
652
+ ("out_layers.0", "norm2"),
653
+ ("out_layers.3", "conv2"),
654
+ ("emb_layers.1", "time_emb_proj"),
655
+ ("skip_connection", "conv_shortcut"),
656
+ ]
657
+
658
+ unet_conversion_map_layer = []
659
+ for i in range(4):
660
+ # loop over downblocks/upblocks
661
+
662
+ for j in range(2):
663
+ # loop over resnets/attentions for downblocks
664
+ hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
665
+ sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
666
+ unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
667
+
668
+ if i < 3:
669
+ # no attention layers in down_blocks.3
670
+ hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
671
+ sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
672
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
673
+
674
+ for j in range(3):
675
+ # loop over resnets/attentions for upblocks
676
+ hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
677
+ sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
678
+ unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
679
+
680
+ if i > 0:
681
+ # no attention layers in up_blocks.0
682
+ hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
683
+ sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
684
+ unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
685
+
686
+ if i < 3:
687
+ # no downsample in down_blocks.3
688
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
689
+ sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
690
+ unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
691
+
692
+ # no upsample in up_blocks.3
693
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
694
+ sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
695
+ unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
696
+
697
+ hf_mid_atn_prefix = "mid_block.attentions.0."
698
+ sd_mid_atn_prefix = "middle_block.1."
699
+ unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
700
 
701
  for j in range(2):
702
+ hf_mid_res_prefix = f"mid_block.resnets.{j}."
703
+ sd_mid_res_prefix = f"middle_block.{2*j}."
704
+ unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
705
+
706
+ # buyer beware: this is a *brittle* function,
707
+ # and correct output requires that all of these pieces interact in
708
+ # the exact order in which I have arranged them.
709
+ mapping = {k: k for k in unet_state_dict.keys()}
710
+ for sd_name, hf_name in unet_conversion_map:
711
+ mapping[hf_name] = sd_name
712
+ for k, v in mapping.items():
713
+ if "resnets" in k:
714
+ for sd_part, hf_part in unet_conversion_map_resnet:
715
+ v = v.replace(hf_part, sd_part)
716
+ mapping[k] = v
717
+ for k, v in mapping.items():
718
+ for sd_part, hf_part in unet_conversion_map_layer:
719
+ v = v.replace(hf_part, sd_part)
720
+ mapping[k] = v
721
+ new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
722
+
723
+ if v2:
724
+ conv_transformer_to_linear(new_state_dict)
725
+
726
+ return new_state_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
727
 
728
 
729
  # ================#
730
  # VAE Conversion #
731
  # ================#
732
 
733
+
734
  def reshape_weight_for_sd(w):
735
  # convert HF linear weights to SD conv2d weights
736
+ return w.reshape(*w.shape, 1, 1)
737
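A shape-only example of the reshape above (the size 512 is illustrative):
import torch
w = torch.randn(512, 512)                                    # HF linear weight [out, in]
assert reshape_weight_for_sd(w).shape == (512, 512, 1, 1)    # SD-style 1x1 conv weight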
 
738
 
739
  def convert_vae_state_dict(vae_state_dict):
740
+ vae_conversion_map = [
741
+ # (stable-diffusion, HF Diffusers)
742
+ ("nin_shortcut", "conv_shortcut"),
743
+ ("norm_out", "conv_norm_out"),
744
+ ("mid.attn_1.", "mid_block.attentions.0."),
745
+ ]
746
+
747
+ for i in range(4):
748
+ # down_blocks have two resnets
749
+ for j in range(2):
750
+ hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
751
+ sd_down_prefix = f"encoder.down.{i}.block.{j}."
752
+ vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
753
+
754
+ if i < 3:
755
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
756
+ sd_downsample_prefix = f"down.{i}.downsample."
757
+ vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
758
+
759
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
760
+ sd_upsample_prefix = f"up.{3-i}.upsample."
761
+ vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
762
+
763
+ # up_blocks have three resnets
764
+ # also, up blocks in hf are numbered in reverse from sd
765
+ for j in range(3):
766
+ hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
767
+ sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
768
+ vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
769
+
770
+ # this part accounts for mid blocks in both the encoder and the decoder
771
+ for i in range(2):
772
+ hf_mid_res_prefix = f"mid_block.resnets.{i}."
773
+ sd_mid_res_prefix = f"mid.block_{i+1}."
774
+ vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
775
+
776
+ vae_conversion_map_attn = [
777
+ # (stable-diffusion, HF Diffusers)
778
+ ("norm.", "group_norm."),
779
+ ("q.", "query."),
780
+ ("k.", "key."),
781
+ ("v.", "value."),
782
+ ("proj_out.", "proj_attn."),
783
+ ]
784
+
785
+ mapping = {k: k for k in vae_state_dict.keys()}
786
+ for k, v in mapping.items():
787
+ for sd_part, hf_part in vae_conversion_map:
788
+ v = v.replace(hf_part, sd_part)
789
+ mapping[k] = v
790
+ for k, v in mapping.items():
791
+ if "attentions" in k:
792
+ for sd_part, hf_part in vae_conversion_map_attn:
793
+ v = v.replace(hf_part, sd_part)
794
+ mapping[k] = v
795
+ new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
796
+ weights_to_convert = ["q", "k", "v", "proj_out"]
797
+ for k, v in new_state_dict.items():
798
+ for weight_name in weights_to_convert:
799
+ if f"mid.attn_1.{weight_name}.weight" in k:
800
+ # print(f"Reshaping {k} for SD format")
801
+ new_state_dict[k] = reshape_weight_for_sd(v)
802
+
803
+ return new_state_dict
804
 
805
 
806
  # endregion
807
 
808
# region custom model load/save code
809
 
810
+
811
  def is_safetensors(path):
812
+ return os.path.splitext(path)[1].lower() == ".safetensors"
813
+
814
+
815
+ def load_checkpoint_with_text_encoder_conversion(ckpt_path, device="cpu"):
816
+ # handle models that store the text encoder keys differently (no 'text_model' prefix)
817
+ TEXT_ENCODER_KEY_REPLACEMENTS = [
818
+ ("cond_stage_model.transformer.embeddings.", "cond_stage_model.transformer.text_model.embeddings."),
819
+ ("cond_stage_model.transformer.encoder.", "cond_stage_model.transformer.text_model.encoder."),
820
+ ("cond_stage_model.transformer.final_layer_norm.", "cond_stage_model.transformer.text_model.final_layer_norm."),
821
+ ]
822
+
823
+ if is_safetensors(ckpt_path):
824
+ checkpoint = None
825
+ state_dict = load_file(ckpt_path) # , device) # may cause an error
 
 
 
 
826
  else:
827
+ checkpoint = torch.load(ckpt_path, map_location=device)
828
+ if "state_dict" in checkpoint:
829
+ state_dict = checkpoint["state_dict"]
830
+ else:
831
+ state_dict = checkpoint
832
+ checkpoint = None
833
 
834
+ key_reps = []
835
+ for rep_from, rep_to in TEXT_ENCODER_KEY_REPLACEMENTS:
836
+ for key in state_dict.keys():
837
+ if key.startswith(rep_from):
838
+ new_key = rep_to + key[len(rep_from) :]
839
+ key_reps.append((key, new_key))
840
 
841
+ for key, new_key in key_reps:
842
+ state_dict[new_key] = state_dict[key]
843
+ del state_dict[key]
844
 
845
+ return checkpoint, state_dict
846
 
847
 
848
# TODO dtype handling looks unreliable and needs checking; building the text_encoder in the requested dtype is unverified
849
+ def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dtype=None):
850
+ _, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path, device)
 
 
 
 
 
 
 
851
 
852
+ # Convert the UNet2DConditionModel model.
853
+ unet_config = create_unet_diffusers_config(v2)
854
+ converted_unet_checkpoint = convert_ldm_unet_checkpoint(v2, state_dict, unet_config)
855
 
856
+ unet = UNet2DConditionModel(**unet_config).to(device)
857
+ info = unet.load_state_dict(converted_unet_checkpoint)
858
+ print("loading u-net:", info)
859
 
860
+ # Convert the VAE model.
861
+ vae_config = create_vae_diffusers_config()
862
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(state_dict, vae_config)
 
 
 
 
 
 
 
863
 
864
+ vae = AutoencoderKL(**vae_config).to(device)
865
+ info = vae.load_state_dict(converted_vae_checkpoint)
866
+ print("loading vae:", info)
867
 
868
+ # convert text_model
869
+ if v2:
870
+ converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v2(state_dict, 77)
871
+ cfg = CLIPTextConfig(
872
+ vocab_size=49408,
873
+ hidden_size=1024,
874
+ intermediate_size=4096,
875
+ num_hidden_layers=23,
876
+ num_attention_heads=16,
877
+ max_position_embeddings=77,
878
+ hidden_act="gelu",
879
+ layer_norm_eps=1e-05,
880
+ dropout=0.0,
881
+ attention_dropout=0.0,
882
+ initializer_range=0.02,
883
+ initializer_factor=1.0,
884
+ pad_token_id=1,
885
+ bos_token_id=0,
886
+ eos_token_id=2,
887
+ model_type="clip_text_model",
888
+ projection_dim=512,
889
+ torch_dtype="float32",
890
+ transformers_version="4.25.0.dev0",
891
+ )
892
+ text_model = CLIPTextModel._from_config(cfg)
893
+ info = text_model.load_state_dict(converted_text_encoder_checkpoint)
894
+ else:
895
+ converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v1(state_dict)
896
 
897
+ logging.set_verbosity_error() # don't show annoying warning
898
+ text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
899
+ logging.set_verbosity_warning()
900
 
901
+ info = text_model.load_state_dict(converted_text_encoder_checkpoint)
902
+ print("loading text encoder:", info)
 
 
 
 
 
 
 
 
903
 
904
+ return text_model, vae, unet
905
 
 
 
 
906
 
907
+ def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=False):
908
+ def convert_key(key):
909
+ # remove position_ids
910
+ if ".position_ids" in key:
911
+ return None
912
+
913
+ # common
914
+ key = key.replace("text_model.encoder.", "transformer.")
915
+ key = key.replace("text_model.", "")
916
+ if "layers" in key:
917
+ # resblocks conversion
918
+ key = key.replace(".layers.", ".resblocks.")
919
+ if ".layer_norm" in key:
920
+ key = key.replace(".layer_norm", ".ln_")
921
+ elif ".mlp." in key:
922
+ key = key.replace(".fc1.", ".c_fc.")
923
+ key = key.replace(".fc2.", ".c_proj.")
924
+ elif ".self_attn.out_proj" in key:
925
+ key = key.replace(".self_attn.out_proj.", ".attn.out_proj.")
926
+ elif ".self_attn." in key:
927
+ key = None # special case, handled separately below
928
+ else:
929
+ raise ValueError(f"unexpected key in DiffUsers model: {key}")
930
+ elif ".position_embedding" in key:
931
+ key = key.replace("embeddings.position_embedding.weight", "positional_embedding")
932
+ elif ".token_embedding" in key:
933
+ key = key.replace("embeddings.token_embedding.weight", "token_embedding.weight")
934
+ elif "final_layer_norm" in key:
935
+ key = key.replace("final_layer_norm", "ln_final")
936
+ return key
937
+
938
+ keys = list(checkpoint.keys())
939
+ new_sd = {}
940
+ for key in keys:
941
+ new_key = convert_key(key)
942
+ if new_key is None:
943
+ continue
944
+ new_sd[new_key] = checkpoint[key]
945
 
946
+ # convert the attention weights
947
+ for key in keys:
948
+ if "layers" in key and "q_proj" in key:
949
+ # concatenate the three projections
950
+ key_q = key
951
+ key_k = key.replace("q_proj", "k_proj")
952
+ key_v = key.replace("q_proj", "v_proj")
953
 
954
+ value_q = checkpoint[key_q]
955
+ value_k = checkpoint[key_k]
956
+ value_v = checkpoint[key_v]
957
+ value = torch.cat([value_q, value_k, value_v])
958
 
959
+ new_key = key.replace("text_model.encoder.layers.", "transformer.resblocks.")
960
+ new_key = new_key.replace(".self_attn.q_proj.", ".attn.in_proj_")
961
+ new_sd[new_key] = value
 
 
 
 
 
 
 
 
962
 
963
+ # whether to fabricate the last layer and other missing weights
964
+ if make_dummy_weights:
965
+ print("make dummy weights for resblock.23, text_projection and logit scale.")
966
+ keys = list(new_sd.keys())
967
+ for key in keys:
968
+ if key.startswith("transformer.resblocks.22."):
969
+ new_sd[key.replace(".22.", ".23.")] = new_sd[key].clone() # without clone(), saving with safetensors fails
 
 
 
 
970
 
971
+ # create weights that the Diffusers model does not include
972
+ new_sd["text_projection"] = torch.ones((1024, 1024), dtype=new_sd[keys[0]].dtype, device=new_sd[keys[0]].device)
973
+ new_sd["logit_scale"] = torch.tensor(1)
974
 
975
+ return new_sd
 
 
976
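The attention handling above is the inverse of the split done at load time: the three 1024x1024 q/k/v projections are stacked into a single in_proj weight. A shape-only sketch:
import torch
q = k = v = torch.randn(1024, 1024)
assert torch.cat([q, k, v]).shape == (3072, 1024)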
 
 
977
 
978
+ def save_stable_diffusion_checkpoint(v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, save_dtype=None, vae=None):
979
+ if ckpt_path is not None:
980
+ # reuse its epoch/step; also reload the checkpoint (including the VAE), e.g. when the VAE is not in memory
981
+ checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path)
982
+ if checkpoint is None: # safetensors file or a state_dict-only ckpt
983
+ checkpoint = {}
984
+ strict = False
985
+ else:
986
+ strict = True
987
+ if "state_dict" in state_dict:
988
+ del state_dict["state_dict"]
989
+ else:
990
+ # build a new one from scratch
991
+ assert vae is not None, "VAE is required to save a checkpoint without a given checkpoint"
992
+ checkpoint = {}
993
+ state_dict = {}
994
+ strict = False
995
+
996
+ def update_sd(prefix, sd):
997
+ for k, v in sd.items():
998
+ key = prefix + k
999
+ assert not strict or key in state_dict, f"Illegal key in save SD: {key}"
1000
+ if save_dtype is not None:
1001
+ v = v.detach().clone().to("cpu").to(save_dtype)
1002
+ state_dict[key] = v
1003
+
1004
+ # Convert the UNet model
1005
+ unet_state_dict = convert_unet_state_dict_to_sd(v2, unet.state_dict())
1006
+ update_sd("model.diffusion_model.", unet_state_dict)
1007
+
1008
+ # Convert the text encoder model
1009
+ if v2:
1010
+ make_dummy = ckpt_path is None # without a reference checkpoint, insert dummy weights (e.g. duplicate the last layer from the one before it)
1011
+ text_enc_dict = convert_text_encoder_state_dict_to_sd_v2(text_encoder.state_dict(), make_dummy)
1012
+ update_sd("cond_stage_model.model.", text_enc_dict)
1013
+ else:
1014
+ text_enc_dict = text_encoder.state_dict()
1015
+ update_sd("cond_stage_model.transformer.", text_enc_dict)
1016
 
1017
+ # Convert the VAE
1018
+ if vae is not None:
1019
+ vae_dict = convert_vae_state_dict(vae.state_dict())
1020
+ update_sd("first_stage_model.", vae_dict)
1021
 
1022
+ # Put together new checkpoint
1023
+ key_count = len(state_dict.keys())
1024
+ new_ckpt = {"state_dict": state_dict}
1025
 
1026
+ # epoch and global_step are sometimes not int
1027
+ try:
1028
+ if "epoch" in checkpoint:
1029
+ epochs += checkpoint["epoch"]
1030
+ if "global_step" in checkpoint:
1031
+ steps += checkpoint["global_step"]
1032
+ except:
1033
+ pass
1034
+
1035
+ new_ckpt["epoch"] = epochs
1036
+ new_ckpt["global_step"] = steps
1037
+
1038
+ if is_safetensors(output_file):
1039
+ # TODO should non-Tensor values be removed from the dict?
1040
+ save_file(state_dict, output_file)
1041
+ else:
1042
+ torch.save(new_ckpt, output_file)
1043
 
1044
+ return key_count
 
 
 
 
 
1045
 
 
 
 
 
 
 
 
1046
 
1047
+ def save_diffusers_checkpoint(v2, output_dir, text_encoder, unet, pretrained_model_name_or_path, vae=None, use_safetensors=False):
1048
+ if pretrained_model_name_or_path is None:
1049
+ # load default settings for v1/v2
1050
+ if v2:
1051
+ pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V2
1052
+ else:
1053
+ pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V1
1054
+
1055
+ scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
1056
+ tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
1057
+ if vae is None:
1058
+ vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
1059
+
1060
+ pipeline = StableDiffusionPipeline(
1061
+ unet=unet,
1062
+ text_encoder=text_encoder,
1063
+ vae=vae,
1064
+ scheduler=scheduler,
1065
+ tokenizer=tokenizer,
1066
+ safety_checker=None,
1067
+ feature_extractor=None,
1068
+ requires_safety_checker=None,
1069
+ )
1070
+ pipeline.save_pretrained(output_dir, safe_serialization=use_safetensors)
1071
 
 
 
1072
 
1073
+ VAE_PREFIX = "first_stage_model."
 
1074
 
1075
 
1076
+ def load_vae(vae_id, dtype):
1077
+ print(f"load VAE: {vae_id}")
1078
+ if os.path.isdir(vae_id) or not os.path.isfile(vae_id):
1079
+ # Diffusers local/remote
1080
+ try:
1081
+ vae = AutoencoderKL.from_pretrained(vae_id, subfolder=None, torch_dtype=dtype)
1082
+ except EnvironmentError as e:
1083
+ print(f"exception occurs in loading vae: {e}")
1084
+ print("retry with subfolder='vae'")
1085
+ vae = AutoencoderKL.from_pretrained(vae_id, subfolder="vae", torch_dtype=dtype)
1086
+ return vae
1087
+
1088
+ # local
1089
+ vae_config = create_vae_diffusers_config()
1090
+
1091
+ if vae_id.endswith(".bin"):
1092
+ # SD 1.5 VAE on Huggingface
1093
+ converted_vae_checkpoint = torch.load(vae_id, map_location="cpu")
1094
+ else:
1095
+ # StableDiffusion
1096
+ vae_model = load_file(vae_id, "cpu") if is_safetensors(vae_id) else torch.load(vae_id, map_location="cpu")
1097
+ vae_sd = vae_model["state_dict"] if "state_dict" in vae_model else vae_model
1098
+
1099
+ # vae only or full model
1100
+ full_model = False
1101
+ for vae_key in vae_sd:
1102
+ if vae_key.startswith(VAE_PREFIX):
1103
+ full_model = True
1104
+ break
1105
+ if not full_model:
1106
+ sd = {}
1107
+ for key, value in vae_sd.items():
1108
+ sd[VAE_PREFIX + key] = value
1109
+ vae_sd = sd
1110
+ del sd
1111
+
1112
+ # Convert the VAE model.
1113
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(vae_sd, vae_config)
1114
+
1115
+ vae = AutoencoderKL(**vae_config)
1116
+ vae.load_state_dict(converted_vae_checkpoint)
1117
+ return vae
1118
 
1119
+
1120
+ # endregion
1121
+
1122
+
1123
+ def make_bucket_resolutions(max_reso, min_size=256, max_size=1024, divisible=64):
1124
+ max_width, max_height = max_reso
1125
+ max_area = (max_width // divisible) * (max_height // divisible)
1126
+
1127
+ resos = set()
1128
+
1129
+ size = int(math.sqrt(max_area)) * divisible
1130
+ resos.add((size, size))
1131
+
1132
+ size = min_size
1133
+ while size <= max_size:
1134
+ width = size
1135
+ height = min(max_size, (max_area // (width // divisible)) * divisible)
1136
+ resos.add((width, height))
1137
+ resos.add((height, width))
1138
+
1139
+ # # make additional resos
1140
+ # if width >= height and width - divisible >= min_size:
1141
+ # resos.add((width - divisible, height))
1142
+ # resos.add((height, width - divisible))
1143
+ # if height >= width and height - divisible >= min_size:
1144
+ # resos.add((width, height - divisible))
1145
+ # resos.add((height - divisible, width))
1146
+
1147
+ size += divisible
1148
+
1149
+ resos = list(resos)
1150
+ resos.sort()
1151
+ return resos
1152
+
1153
+
1154
+ if __name__ == "__main__":
1155
+ resos = make_bucket_resolutions((512, 768))
1156
+ print(len(resos))
1157
+ print(resos)
1158
+ aspect_ratios = [w / h for w, h in resos]
1159
+ print(aspect_ratios)
1160
+
1161
+ ars = set()
1162
+ for ar in aspect_ratios:
1163
+ if ar in ars:
1164
+ print("error! duplicate ar:", ar)
1165
+ ars.add(ar)
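
Note (not part of the diff above): the checkpoint assembly in save_stable_diffusion_checkpoint boils down to copying each sub-model's keys under its layout prefix, casting to the save dtype, and moving to CPU. A minimal sketch of that pattern, with made-up tensor names standing in for the converted unet / text_encoder / vae state dicts:

import torch

state_dict = {}

def update_sd(prefix, sd, save_dtype=torch.float16):
    # copy each key under its checkpoint-layout prefix, cast and move to CPU
    for k, v in sd.items():
        state_dict[prefix + k] = v.detach().clone().to("cpu").to(save_dtype)

update_sd("model.diffusion_model.", {"input_blocks.0.0.weight": torch.randn(4, 4)})
update_sd("cond_stage_model.transformer.", {"text_model.embeddings.token_embedding.weight": torch.randn(8, 4)})
update_sd("first_stage_model.", {"encoder.conv_in.weight": torch.randn(8, 3, 3, 3)})

print(sorted(state_dict))  # keys now follow the SD checkpoint layout
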
lycoris/locon.py CHANGED
@@ -16,7 +16,8 @@ class LoConModule(nn.Module):
16
  multiplier=1.0,
17
  lora_dim=4, alpha=1,
18
  dropout=0.,
19
- use_cp=True,
 
20
  ):
21
  """ if alpha == 0 or None, alpha is rank (no scaling). """
22
  super().__init__()
 
16
  multiplier=1.0,
17
  lora_dim=4, alpha=1,
18
  dropout=0.,
19
+ use_cp=False,
20
+ **kwargs,
21
  ):
22
  """ if alpha == 0 or None, alpha is rank (no scaling). """
23
  super().__init__()
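
Note on this signature change (the same change appears in loha.py below): defaulting use_cp to False and accepting **kwargs presumably lets one network factory pass algorithm-specific options (e.g. factor, decompose_both for LoKr) to every module type, with modules silently ignoring options they do not use. A tiny illustrative sketch with a hypothetical class, not from the repo:

import torch.nn as nn

class DummyModule(nn.Module):
    def __init__(self, lora_name, lora_dim=4, use_cp=False, **kwargs):  # unknown options are absorbed by **kwargs
        super().__init__()
        self.lora_name = lora_name

# 'factor' and 'decompose_both' are only meaningful for LoKr, but passing them here is harmless
m = DummyModule("lora_te_dummy", lora_dim=8, use_cp=True, factor=4, decompose_both=True)
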
lycoris/loha.py CHANGED
@@ -92,7 +92,8 @@ class LohaModule(nn.Module):
92
  lora_name,
93
  org_module: nn.Module,
94
  multiplier=1.0, lora_dim=4, alpha=1, dropout=0.,
95
- use_cp=True,
 
96
  ):
97
  """ if alpha == 0 or None, alpha is rank (no scaling). """
98
  super().__init__()
 
92
  lora_name,
93
  org_module: nn.Module,
94
  multiplier=1.0, lora_dim=4, alpha=1, dropout=0.,
95
+ use_cp=False,
96
+ **kwargs,
97
  ):
98
  """ if alpha == 0 or None, alpha is rank (no scaling). """
99
  super().__init__()
lycoris/lokr.py ADDED
@@ -0,0 +1,220 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ # 4, build custom backward function
8
+ # -
9
+
10
+
11
+ def factorization(dimension: int, factor:int=-1) -> tuple[int, int]:
12
+ '''
13
+ return a tuple of two value of input dimension decomposed by the number closest to factor
14
+ second value is higher or equal than first value.
15
+
16
+ In LoRA with Kroneckor Product, first value is a value for weight scale.
17
+ secon value is a value for weight.
18
+
19
+ Becuase of non-commutative property, A⊗B ≠ B⊗A. Meaning of two matrices is slightly different.
20
+
21
+ examples)
22
+ factor
23
+ -1 2 4 8 16 ...
24
+ 127 -> 127, 1 127 -> 127, 1 127 -> 127, 1 127 -> 127, 1 127 -> 127, 1
25
+ 128 -> 16, 8 128 -> 64, 2 128 -> 32, 4 128 -> 16, 8 128 -> 16, 8
26
+ 250 -> 125, 2 250 -> 125, 2 250 -> 125, 2 250 -> 125, 2 250 -> 125, 2
27
+ 360 -> 45, 8 360 -> 180, 2 360 -> 90, 4 360 -> 45, 8 360 -> 45, 8
28
+ 512 -> 32, 16 512 -> 256, 2 512 -> 128, 4 512 -> 64, 8 512 -> 32, 16
29
+ 1024 -> 32, 32 1024 -> 512, 2 1024 -> 256, 4 1024 -> 128, 8 1024 -> 64, 16
30
+ '''
31
+
32
+ if factor > 0 and (dimension % factor) == 0:
33
+ m = factor
34
+ n = dimension // factor
35
+ return m, n
36
+ if factor == -1:
37
+ factor = dimension
38
+ m, n = 1, dimension
39
+ length = m + n
40
+ while m<n:
41
+ new_m = m + 1
42
+ while dimension%new_m != 0:
43
+ new_m += 1
44
+ new_n = dimension // new_m
45
+ if new_m + new_n > length or new_m>factor:
46
+ break
47
+ else:
48
+ m, n = new_m, new_n
49
+ if m > n:
50
+ n, m = m, n
51
+ return m, n
52
+
53
+
54
+ def make_weight_cp(t, wa, wb):
55
+ rebuild2 = torch.einsum('i j k l, i p, j r -> p r k l', t, wa, wb) # [c, d, k1, k2]
56
+ return rebuild2
57
+
58
+
59
+ def make_kron(orig_weight, w1, w2, scale):
60
+ if len(w2.shape) == 4:
61
+ w1 = w1.unsqueeze(2).unsqueeze(2)
62
+ w2 = w2.contiguous()
63
+ return orig_weight + torch.kron(w1, w2).reshape(orig_weight.shape)*scale
64
+
65
+
66
+ class LokrModule(nn.Module):
67
+ """
68
+ modified from kohya-ss/sd-scripts/networks/lora:LoRAModule
69
+ and from KohakuBlueleaf/LyCORIS/lycoris:loha:LoHaModule
70
+ and from KohakuBlueleaf/LyCORIS/lycoris:locon:LoconModule
71
+ """
72
+
73
+ def __init__(
74
+ self,
75
+ lora_name, org_module: nn.Module,
76
+ multiplier=1.0,
77
+ lora_dim=4, alpha=1,
78
+ dropout=0.,
79
+ use_cp=False,
80
+ decompose_both = False,
81
+ factor:int=-1, # factorization factor
82
+ **kwargs,
83
+ ):
84
+ """ if alpha == 0 or None, alpha is rank (no scaling). """
85
+ super().__init__()
86
+ factor = int(factor)
87
+ self.lora_name = lora_name
88
+ self.lora_dim = lora_dim
89
+ self.cp = False
90
+ self.use_w1 = False
91
+ self.use_w2 = False
92
+
93
+ self.shape = org_module.weight.shape
94
+ if org_module.__class__.__name__ == 'Conv2d':
95
+ in_dim = org_module.in_channels
96
+ k_size = org_module.kernel_size
97
+ out_dim = org_module.out_channels
98
+
99
+ in_m, in_n = factorization(in_dim, factor)
100
+ out_l, out_k = factorization(out_dim, factor)
101
+ shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), *k_size)
102
+
103
+ self.cp = use_cp and k_size!=(1, 1)
104
+ if decompose_both and lora_dim < max(shape[0][0], shape[1][0])/2:
105
+ self.lokr_w1_a = nn.Parameter(torch.empty(shape[0][0], lora_dim))
106
+ self.lokr_w1_b = nn.Parameter(torch.empty(lora_dim, shape[1][0]))
107
+ else:
108
+ self.use_w1 = True
109
+ self.lokr_w1 = nn.Parameter(torch.empty(shape[0][0], shape[1][0])) # a*c, 1-mode
110
+
111
+ if lora_dim >= max(shape[0][1], shape[1][1])/2:
112
+ self.use_w2 = True
113
+ self.lokr_w2 = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *k_size))
114
+ elif self.cp:
115
+ self.lokr_t2 = nn.Parameter(torch.empty(lora_dim, lora_dim, shape[2], shape[3]))
116
+ self.lokr_w2_a = nn.Parameter(torch.empty(lora_dim, shape[0][1])) # b, 1-mode
117
+ self.lokr_w2_b = nn.Parameter(torch.empty(lora_dim, shape[1][1])) # d, 2-mode
118
+ else: # Conv2d not cp
119
+ # bigger part. weight and LoRA. [b, dim] x [dim, d*k1*k2]
120
+ self.lokr_w2_a = nn.Parameter(torch.empty(shape[0][1], lora_dim))
121
+ self.lokr_w2_b = nn.Parameter(torch.empty(lora_dim, shape[1][1]*shape[2]*shape[3]))
122
+ # w1 ⊗ (w2_a x w2_b) = (a, b)⊗((c, dim)x(dim, d*k1*k2)) = (a, b)⊗(c, d*k1*k2) = (ac, bd*k1*k2)
123
+
124
+ self.op = F.conv2d
125
+ self.extra_args = {
126
+ "stride": org_module.stride,
127
+ "padding": org_module.padding,
128
+ "dilation": org_module.dilation,
129
+ "groups": org_module.groups
130
+ }
131
+
132
+ else: # Linear
133
+ in_dim = org_module.in_features
134
+ out_dim = org_module.out_features
135
+
136
+ in_m, in_n = factorization(in_dim, factor)
137
+ out_l, out_k = factorization(out_dim, factor)
138
+ shape = ((out_l, out_k), (in_m, in_n)) # ((a, b), (c, d)), out_dim = a*c, in_dim = b*d
139
+
140
+ # smaller part. weight scale
141
+ if decompose_both and lora_dim < max(shape[0][0], shape[1][0])/2:
142
+ self.lokr_w1_a = nn.Parameter(torch.empty(shape[0][0], lora_dim))
143
+ self.lokr_w1_b = nn.Parameter(torch.empty(lora_dim, shape[1][0]))
144
+ else:
145
+ self.use_w1 = True
146
+ self.lokr_w1 = nn.Parameter(torch.empty(shape[0][0], shape[1][0])) # a*c, 1-mode
147
+
148
+ if lora_dim < max(shape[0][1], shape[1][1])/2:
149
+ # bigger part. weight and LoRA. [b, dim] x [dim, d]
150
+ self.lokr_w2_a = nn.Parameter(torch.empty(shape[0][1], lora_dim))
151
+ self.lokr_w2_b = nn.Parameter(torch.empty(lora_dim, shape[1][1]))
152
+ # w1 ⊗ (w2_a x w2_b) = (a, b)⊗((c, dim)x(dim, d)) = (a, b)⊗(c, d) = (ac, bd)
153
+ else:
154
+ self.use_w2 = True
155
+ self.lokr_w2 = nn.Parameter(torch.empty(shape[0][1], shape[1][1]))
156
+
157
+ self.op = F.linear
158
+ self.extra_args = {}
159
+
160
+ if dropout:
161
+ self.dropout = nn.Dropout(dropout)
162
+ else:
163
+ self.dropout = nn.Identity()
164
+
165
+ if isinstance(alpha, torch.Tensor):
166
+ alpha = alpha.detach().float().numpy() # without casting, bf16 causes error
167
+ alpha = lora_dim if alpha is None or alpha == 0 else alpha
168
+ if self.use_w2 and self.use_w1:
169
+ #use scale = 1
170
+ alpha = lora_dim
171
+ self.scale = alpha / self.lora_dim
172
+ self.register_buffer('alpha', torch.tensor(alpha)) # can be treated as a constant
173
+
174
+ if self.use_w2:
175
+ torch.nn.init.constant_(self.lokr_w2, 0)
176
+ else:
177
+ if self.cp:
178
+ torch.nn.init.normal_(self.lokr_t2, std=0.1)
179
+ torch.nn.init.normal_(self.lokr_w2_a, std=1)
180
+ torch.nn.init.constant_(self.lokr_w2_b, 0)
181
+
182
+ if self.use_w1:
183
+ torch.nn.init.normal_(self.lokr_w1, std=1)
184
+ else:
185
+ torch.nn.init.normal_(self.lokr_w1_a, std=1)
186
+ torch.nn.init.normal_(self.lokr_w1_b, std=0.1)
187
+
188
+ self.multiplier = multiplier
189
+ self.org_module = [org_module]
190
+ weight = make_kron(
191
+ self.org_module[0].weight.data,
192
+ self.lokr_w1 if self.use_w1 else self.lokr_w1_a@self.lokr_w1_b,
193
+ (self.lokr_w2 if self.use_w2
194
+ else make_weight_cp(self.lokr_t2, self.lokr_w2_a, self.lokr_w2_b) if self.cp
195
+ else self.lokr_w2_a@self.lokr_w2_b),
196
+ torch.tensor(self.multiplier * self.scale)
197
+ )
198
+ assert torch.sum(torch.isnan(weight)) == 0, "weight is nan"
199
+
200
+ # Same as locon.py
201
+ def apply_to(self):
202
+ self.org_forward = self.org_module[0].forward
203
+ self.org_module[0].forward = self.forward
204
+
205
+ def forward(self, x):
206
+ weight = make_kron(
207
+ self.org_module[0].weight.data,
208
+ self.lokr_w1 if self.use_w1 else self.lokr_w1_a@self.lokr_w1_b,
209
+ (self.lokr_w2 if self.use_w2
210
+ else make_weight_cp(self.lokr_t2, self.lokr_w2_a, self.lokr_w2_b) if self.cp
211
+ else self.lokr_w2_a@self.lokr_w2_b),
212
+ torch.tensor(self.multiplier * self.scale)
213
+ )
214
+ bias = None if self.org_module[0].bias is None else self.org_module[0].bias.data
215
+ return self.op(
216
+ x,
217
+ weight.view(self.shape),
218
+ bias,
219
+ **self.extra_args
220
+ )
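
As a quick sanity check of the reconstruction LokrModule performs (a sketch with assumed shapes, not part of the commit): for a Linear weight factorized as out_dim = a*c and in_dim = b*d, torch.kron of an (a, b) matrix with a (c, d) matrix has shape (a*c, b*d), so it can be reshaped onto the original weight, and a zero-initialized lokr_w2 leaves the merged weight equal to the original at the start of training.

import torch

a, b, c, d = 4, 8, 16, 32            # hypothetical factorization of a (64, 256) Linear weight
orig = torch.randn(a * c, b * d)     # stands in for org_module.weight
w1 = torch.randn(a, b)               # small factor (weight-scale part)
w2 = torch.zeros(c, d)               # large factor, zero-initialized like lokr_w2

merged = orig + torch.kron(w1, w2).reshape(orig.shape) * 1.0
assert torch.equal(merged, orig)     # zero-init w2 => no change before training
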
lycoris/utils.py CHANGED
@@ -24,6 +24,7 @@ def extract_conv(
24
  mode = 'fixed',
25
  mode_param = 0,
26
  device = 'cpu',
 
27
  ) -> Tuple[nn.Parameter, nn.Parameter]:
28
  weight = weight.to(device)
29
  out_ch, in_ch, kernel_size, _ = weight.shape
@@ -48,6 +49,8 @@ def extract_conv(
48
  raise NotImplementedError('Extract mode should be "fixed", "threshold", "ratio" or "quantile"')
49
  lora_rank = max(1, lora_rank)
50
  lora_rank = min(out_ch, in_ch, lora_rank)
 
 
51
 
52
  U = U[:, :lora_rank]
53
  S = S[:lora_rank]
@@ -58,29 +61,7 @@ def extract_conv(
58
  extract_weight_A = Vh.reshape(lora_rank, in_ch, kernel_size, kernel_size).detach()
59
  extract_weight_B = U.reshape(out_ch, lora_rank, 1, 1).detach()
60
  del U, S, Vh, weight
61
- return extract_weight_A, extract_weight_B, diff
62
-
63
-
64
- def merge_conv(
65
- weight_a: Union[torch.Tensor, nn.Parameter],
66
- weight_b: Union[torch.Tensor, nn.Parameter],
67
- device = 'cpu'
68
- ):
69
- rank, in_ch, kernel_size, k_ = weight_a.shape
70
- out_ch, rank_, _, _ = weight_b.shape
71
- assert rank == rank_ and kernel_size == k_
72
-
73
- wa = weight_a.to(device)
74
- wb = weight_b.to(device)
75
-
76
- if device == 'cpu':
77
- wa = wa.float()
78
- wb = wb.float()
79
-
80
- merged = wb.reshape(out_ch, -1) @ wa.reshape(rank, -1)
81
- weight = merged.reshape(out_ch, in_ch, kernel_size, kernel_size)
82
- del wb, wa
83
- return weight
84
 
85
 
86
  def extract_linear(
@@ -112,6 +93,8 @@ def extract_linear(
112
  raise NotImplementedError('Extract mode should be "fixed", "threshold", "ratio" or "quantile"')
113
  lora_rank = max(1, lora_rank)
114
  lora_rank = min(out_ch, in_ch, lora_rank)
 
 
115
 
116
  U = U[:, :lora_rank]
117
  S = S[:lora_rank]
@@ -122,28 +105,7 @@ def extract_linear(
122
  extract_weight_A = Vh.reshape(lora_rank, in_ch).detach()
123
  extract_weight_B = U.reshape(out_ch, lora_rank).detach()
124
  del U, S, Vh, weight
125
- return extract_weight_A, extract_weight_B, diff
126
-
127
-
128
- def merge_linear(
129
- weight_a: Union[torch.Tensor, nn.Parameter],
130
- weight_b: Union[torch.Tensor, nn.Parameter],
131
- device = 'cpu'
132
- ):
133
- rank, in_ch = weight_a.shape
134
- out_ch, rank_ = weight_b.shape
135
- assert rank == rank_
136
-
137
- wa = weight_a.to(device)
138
- wb = weight_b.to(device)
139
-
140
- if device == 'cpu':
141
- wa = wa.float()
142
- wb = wb.float()
143
-
144
- weight = wb @ wa
145
- del wb, wa
146
- return weight
147
 
148
 
149
  def extract_diff(
@@ -200,30 +162,38 @@ def extract_diff(
200
  for child_name, child_module in module.named_modules():
201
  lora_name = prefix + '.' + name + '.' + child_name
202
  lora_name = lora_name.replace('.', '_')
203
-
204
  layer = child_module.__class__.__name__
 
 
 
 
 
205
  if layer == 'Linear':
206
- extract_a, extract_b, diff = extract_linear(
207
  (child_module.weight - weights[child_name]),
208
  mode,
209
  linear_mode_param,
210
  device = extract_device,
211
  )
 
 
212
  elif layer == 'Conv2d':
213
  is_linear = (child_module.weight.shape[2] == 1
214
  and child_module.weight.shape[3] == 1)
215
- extract_a, extract_b, diff = extract_conv(
216
  (child_module.weight - weights[child_name]),
217
  mode,
218
  linear_mode_param if is_linear else conv_mode_param,
219
  device = extract_device,
220
  )
221
- if small_conv and not is_linear:
 
 
222
  dim = extract_a.size(0)
223
- extract_c, extract_a, _ = extract_conv(
224
  extract_a.transpose(0, 1),
225
  'fixed', dim,
226
- extract_device
227
  )
228
  extract_a = extract_a.transpose(0, 1)
229
  extract_c = extract_c.transpose(0, 1)
@@ -235,77 +205,92 @@ def extract_diff(
235
  del extract_c
236
  else:
237
  continue
238
- loras[f'{lora_name}.lora_down.weight'] = extract_a.detach().cpu().contiguous().half()
239
- loras[f'{lora_name}.lora_up.weight'] = extract_b.detach().cpu().contiguous().half()
240
- loras[f'{lora_name}.alpha'] = torch.Tensor([extract_a.shape[0]]).half()
241
-
242
- if use_bias:
243
- diff = diff.detach().cpu().reshape(extract_b.size(0), -1)
244
- sparse_diff = make_sparse(diff, sparsity).to_sparse().coalesce()
245
-
246
- indices = sparse_diff.indices().to(torch.int16)
247
- values = sparse_diff.values().half()
248
- loras[f'{lora_name}.bias_indices'] = indices
249
- loras[f'{lora_name}.bias_values'] = values
250
- loras[f'{lora_name}.bias_size'] = torch.tensor(diff.shape).to(torch.int16)
251
- del extract_a, extract_b, diff
 
 
 
 
252
  elif name in temp_name:
253
- weight = temp_name[name]
254
  lora_name = prefix + '.' + name
255
  lora_name = lora_name.replace('.', '_')
 
256
 
257
- if weight.size(0)<32 or weight.size(1)<32:
258
- loras[f'{lora_name}.diff'] = module.weight - weight
259
- continue
 
260
 
261
- layer = module.__class__.__name__
262
  if layer == 'Linear':
263
- extract_a, extract_b, diff = extract_linear(
264
- (module.weight - weight),
265
  mode,
266
  linear_mode_param,
267
  device = extract_device,
268
  )
 
 
269
  elif layer == 'Conv2d':
270
- is_linear = (module.weight.shape[2] == 1
271
- and module.weight.shape[3] == 1)
272
- extract_a, extract_b, diff = extract_conv(
273
- (module.weight - weight),
 
 
274
  mode,
275
  linear_mode_param if is_linear else conv_mode_param,
276
  device = extract_device,
277
  )
278
- if small_conv and not is_linear:
 
 
279
  dim = extract_a.size(0)
280
- extract_c, extract_a, _ = extract_conv(
281
  extract_a.transpose(0, 1),
282
  'fixed', dim,
283
- extract_device
284
  )
285
  extract_a = extract_a.transpose(0, 1)
286
  extract_c = extract_c.transpose(0, 1)
287
  loras[f'{lora_name}.lora_mid.weight'] = extract_c.detach().cpu().contiguous().half()
288
- diff = module.weight - torch.einsum(
289
  'i j k l, j r, p i -> p r k l',
290
  extract_c, extract_a.flatten(1, -1), extract_b.flatten(1, -1)
291
  ).detach().cpu().contiguous()
292
  del extract_c
293
  else:
294
  continue
295
- loras[f'{lora_name}.lora_down.weight'] = extract_a.detach().cpu().contiguous().half()
296
- loras[f'{lora_name}.lora_up.weight'] = extract_b.detach().cpu().contiguous().half()
297
- loras[f'{lora_name}.alpha'] = torch.Tensor([extract_a.shape[0]]).half()
298
-
299
- if use_bias:
300
- diff = diff.detach().cpu().reshape(extract_b.size(0), -1)
301
- sparse_diff = make_sparse(diff, sparsity).to_sparse().coalesce()
302
-
303
- indices = sparse_diff.indices().to(torch.int16)
304
- values = sparse_diff.values().half()
305
- loras[f'{lora_name}.bias_indices'] = indices
306
- loras[f'{lora_name}.bias_values'] = values
307
- loras[f'{lora_name}.bias_size'] = torch.tensor(diff.shape).to(torch.int16)
308
- del extract_a, extract_b, diff
 
 
 
 
309
  return loras
310
 
311
  text_encoder_loras = make_state_dict(
@@ -324,70 +309,125 @@ def extract_diff(
324
  return text_encoder_loras|unet_loras
325
 
326
 
327
- def merge_locon(
328
- base_model,
329
- locon_state_dict: Dict[str, torch.TensorType],
330
- scale: float = 1.0,
331
- device = 'cpu'
332
  ):
333
- UNET_TARGET_REPLACE_MODULE = [
334
- "Transformer2DModel",
335
- "Attention",
336
- "ResnetBlock2D",
337
- "Downsample2D",
338
- "Upsample2D"
339
- ]
340
- TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
341
- LORA_PREFIX_UNET = 'lora_unet'
342
- LORA_PREFIX_TEXT_ENCODER = 'lora_te'
343
- def merge(
344
- prefix,
345
- root_module: torch.nn.Module,
346
- target_replace_modules
347
- ):
348
- temp = {}
349
-
350
- for name, module in tqdm(list(root_module.named_modules())):
351
- if module.__class__.__name__ in target_replace_modules:
352
- temp[name] = {}
353
- for child_name, child_module in module.named_modules():
354
- layer = child_module.__class__.__name__
355
- if layer not in {'Linear', 'Conv2d'}:
356
- continue
357
- lora_name = prefix + '.' + name + '.' + child_name
358
- lora_name = lora_name.replace('.', '_')
359
-
360
- down = locon_state_dict[f'{lora_name}.lora_down.weight'].float()
361
- up = locon_state_dict[f'{lora_name}.lora_up.weight'].float()
362
- alpha = locon_state_dict[f'{lora_name}.alpha'].float()
363
- rank = down.shape[0]
364
-
365
- if layer == 'Conv2d':
366
- delta = merge_conv(down, up, device)
367
- child_module.weight.requires_grad_(False)
368
- child_module.weight += (alpha.to(device)/rank * scale * delta).cpu()
369
- del delta
370
- elif layer == 'Linear':
371
- delta = merge_linear(down, up, device)
372
- child_module.weight.requires_grad_(False)
373
- child_module.weight += (alpha.to(device)/rank * scale * delta).cpu()
374
- del delta
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
 
376
- merge(
377
- LORA_PREFIX_TEXT_ENCODER,
378
- base_model[0],
379
- TEXT_ENCODER_TARGET_REPLACE_MODULE
380
- )
381
- merge(
382
- LORA_PREFIX_UNET,
383
- base_model[2],
384
- UNET_TARGET_REPLACE_MODULE
385
- )
386
 
387
 
388
- def merge_loha(
389
  base_model,
390
- loha_state_dict: Dict[str, torch.TensorType],
391
  scale: float = 1.0,
392
  device = 'cpu'
393
  ):
@@ -398,51 +438,67 @@ def merge_loha(
398
  "Downsample2D",
399
  "Upsample2D"
400
  ]
 
 
 
 
 
 
401
  TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
402
  LORA_PREFIX_UNET = 'lora_unet'
403
  LORA_PREFIX_TEXT_ENCODER = 'lora_te'
404
- def merge(
 
405
  prefix,
406
  root_module: torch.nn.Module,
407
- target_replace_modules
 
 
408
  ):
409
- temp = {}
410
-
411
- for name, module in tqdm(list(root_module.named_modules())):
412
  if module.__class__.__name__ in target_replace_modules:
413
- temp[name] = {}
414
  for child_name, child_module in module.named_modules():
415
- layer = child_module.__class__.__name__
416
- if layer not in {'Linear', 'Conv2d'}:
417
  continue
418
  lora_name = prefix + '.' + name + '.' + child_name
419
  lora_name = lora_name.replace('.', '_')
420
 
421
- w1a = loha_state_dict[f'{lora_name}.hada_w1_a'].float().to(device)
422
- w1b = loha_state_dict[f'{lora_name}.hada_w1_b'].float().to(device)
423
- w2a = loha_state_dict[f'{lora_name}.hada_w2_a'].float().to(device)
424
- w2b = loha_state_dict[f'{lora_name}.hada_w2_b'].float().to(device)
425
- alpha = loha_state_dict[f'{lora_name}.alpha'].float().to(device)
426
- dim = w1b.shape[0]
427
-
428
- delta = (w1a @ w1b) * (w2a @ w2b)
429
- delta = delta.reshape(child_module.weight.shape)
 
430
 
431
- if layer == 'Conv2d':
432
- child_module.weight.requires_grad_(False)
433
- child_module.weight += (alpha.to(device)/dim * scale * delta).cpu()
434
- elif layer == 'Linear':
435
- child_module.weight.requires_grad_(False)
436
- child_module.weight += (alpha.to(device)/dim * scale * delta).cpu()
437
- del delta
438
 
439
- merge(
440
- LORA_PREFIX_TEXT_ENCODER,
441
- base_model[0],
442
- TEXT_ENCODER_TARGET_REPLACE_MODULE
 
 
 
 
 
 
443
  )
444
- merge(
445
  LORA_PREFIX_UNET,
446
- base_model[2],
447
- UNET_TARGET_REPLACE_MODULE
448
- )
 
 
 
 
24
  mode = 'fixed',
25
  mode_param = 0,
26
  device = 'cpu',
27
+ is_cp = False,
28
  ) -> Tuple[nn.Parameter, nn.Parameter]:
29
  weight = weight.to(device)
30
  out_ch, in_ch, kernel_size, _ = weight.shape
 
49
  raise NotImplementedError('Extract mode should be "fixed", "threshold", "ratio" or "quantile"')
50
  lora_rank = max(1, lora_rank)
51
  lora_rank = min(out_ch, in_ch, lora_rank)
52
+ if lora_rank>=out_ch/2 and not is_cp:
53
+ return weight, 'full'
54
 
55
  U = U[:, :lora_rank]
56
  S = S[:lora_rank]
 
61
  extract_weight_A = Vh.reshape(lora_rank, in_ch, kernel_size, kernel_size).detach()
62
  extract_weight_B = U.reshape(out_ch, lora_rank, 1, 1).detach()
63
  del U, S, Vh, weight
64
+ return (extract_weight_A, extract_weight_B, diff), 'low rank'
65
 
66
 
67
  def extract_linear(
 
93
  raise NotImplementedError('Extract mode should be "fixed", "threshold", "ratio" or "quantile"')
94
  lora_rank = max(1, lora_rank)
95
  lora_rank = min(out_ch, in_ch, lora_rank)
96
+ if lora_rank>=out_ch/2:
97
+ return weight, 'full'
98
 
99
  U = U[:, :lora_rank]
100
  S = S[:lora_rank]
 
105
  extract_weight_A = Vh.reshape(lora_rank, in_ch).detach()
106
  extract_weight_B = U.reshape(out_ch, lora_rank).detach()
107
  del U, S, Vh, weight
108
+ return (extract_weight_A, extract_weight_B, diff), 'low rank'
109
 
110
 
111
  def extract_diff(
 
162
  for child_name, child_module in module.named_modules():
163
  lora_name = prefix + '.' + name + '.' + child_name
164
  lora_name = lora_name.replace('.', '_')
 
165
  layer = child_module.__class__.__name__
166
+ if layer in {'Linear', 'Conv2d'}:
167
+ root_weight = child_module.weight
168
+ if torch.allclose(root_weight, weights[child_name]):
169
+ continue
170
+
171
  if layer == 'Linear':
172
+ weight, decompose_mode = extract_linear(
173
  (child_module.weight - weights[child_name]),
174
  mode,
175
  linear_mode_param,
176
  device = extract_device,
177
  )
178
+ if decompose_mode == 'low rank':
179
+ extract_a, extract_b, diff = weight
180
  elif layer == 'Conv2d':
181
  is_linear = (child_module.weight.shape[2] == 1
182
  and child_module.weight.shape[3] == 1)
183
+ weight, decompose_mode = extract_conv(
184
  (child_module.weight - weights[child_name]),
185
  mode,
186
  linear_mode_param if is_linear else conv_mode_param,
187
  device = extract_device,
188
  )
189
+ if decompose_mode == 'low rank':
190
+ extract_a, extract_b, diff = weight
191
+ if small_conv and not is_linear and decompose_mode == 'low rank':
192
  dim = extract_a.size(0)
193
+ (extract_c, extract_a, _), _ = extract_conv(
194
  extract_a.transpose(0, 1),
195
  'fixed', dim,
196
+ extract_device, True
197
  )
198
  extract_a = extract_a.transpose(0, 1)
199
  extract_c = extract_c.transpose(0, 1)
 
205
  del extract_c
206
  else:
207
  continue
208
+ if decompose_mode == 'low rank':
209
+ loras[f'{lora_name}.lora_down.weight'] = extract_a.detach().cpu().contiguous().half()
210
+ loras[f'{lora_name}.lora_up.weight'] = extract_b.detach().cpu().contiguous().half()
211
+ loras[f'{lora_name}.alpha'] = torch.Tensor([extract_a.shape[0]]).half()
212
+ if use_bias:
213
+ diff = diff.detach().cpu().reshape(extract_b.size(0), -1)
214
+ sparse_diff = make_sparse(diff, sparsity).to_sparse().coalesce()
215
+
216
+ indices = sparse_diff.indices().to(torch.int16)
217
+ values = sparse_diff.values().half()
218
+ loras[f'{lora_name}.bias_indices'] = indices
219
+ loras[f'{lora_name}.bias_values'] = values
220
+ loras[f'{lora_name}.bias_size'] = torch.tensor(diff.shape).to(torch.int16)
221
+ del extract_a, extract_b, diff
222
+ elif decompose_mode == 'full':
223
+ loras[f'{lora_name}.diff'] = weight.detach().cpu().contiguous().half()
224
+ else:
225
+ raise NotImplementedError
226
  elif name in temp_name:
227
+ weights = temp_name[name]
228
  lora_name = prefix + '.' + name
229
  lora_name = lora_name.replace('.', '_')
230
+ layer = module.__class__.__name__
231
 
232
+ if layer in {'Linear', 'Conv2d'}:
233
+ root_weight = module.weight
234
+ if torch.allclose(root_weight, weights):
235
+ continue
236
 
 
237
  if layer == 'Linear':
238
+ weight, decompose_mode = extract_linear(
239
+ (root_weight - weights),
240
  mode,
241
  linear_mode_param,
242
  device = extract_device,
243
  )
244
+ if decompose_mode == 'low rank':
245
+ extract_a, extract_b, diff = weight
246
  elif layer == 'Conv2d':
247
+ is_linear = (
248
+ root_weight.shape[2] == 1
249
+ and root_weight.shape[3] == 1
250
+ )
251
+ weight, decompose_mode = extract_conv(
252
+ (root_weight - weights),
253
  mode,
254
  linear_mode_param if is_linear else conv_mode_param,
255
  device = extract_device,
256
  )
257
+ if decompose_mode == 'low rank':
258
+ extract_a, extract_b, diff = weight
259
+ if small_conv and not is_linear and decompose_mode == 'low rank':
260
  dim = extract_a.size(0)
261
+ (extract_c, extract_a, _), _ = extract_conv(
262
  extract_a.transpose(0, 1),
263
  'fixed', dim,
264
+ extract_device, True
265
  )
266
  extract_a = extract_a.transpose(0, 1)
267
  extract_c = extract_c.transpose(0, 1)
268
  loras[f'{lora_name}.lora_mid.weight'] = extract_c.detach().cpu().contiguous().half()
269
+ diff = root_weight - torch.einsum(
270
  'i j k l, j r, p i -> p r k l',
271
  extract_c, extract_a.flatten(1, -1), extract_b.flatten(1, -1)
272
  ).detach().cpu().contiguous()
273
  del extract_c
274
  else:
275
  continue
276
+ if decompose_mode == 'low rank':
277
+ loras[f'{lora_name}.lora_down.weight'] = extract_a.detach().cpu().contiguous().half()
278
+ loras[f'{lora_name}.lora_up.weight'] = extract_b.detach().cpu().contiguous().half()
279
+ loras[f'{lora_name}.alpha'] = torch.Tensor([extract_a.shape[0]]).half()
280
+ if use_bias:
281
+ diff = diff.detach().cpu().reshape(extract_b.size(0), -1)
282
+ sparse_diff = make_sparse(diff, sparsity).to_sparse().coalesce()
283
+
284
+ indices = sparse_diff.indices().to(torch.int16)
285
+ values = sparse_diff.values().half()
286
+ loras[f'{lora_name}.bias_indices'] = indices
287
+ loras[f'{lora_name}.bias_values'] = values
288
+ loras[f'{lora_name}.bias_size'] = torch.tensor(diff.shape).to(torch.int16)
289
+ del extract_a, extract_b, diff
290
+ elif decompose_mode == 'full':
291
+ loras[f'{lora_name}.diff'] = weight.detach().cpu().contiguous().half()
292
+ else:
293
+ raise NotImplementedError
294
  return loras
295
 
296
  text_encoder_loras = make_state_dict(
 
309
  return text_encoder_loras|unet_loras
310
 
311
 
312
+ def get_module(
313
+ lyco_state_dict: Dict,
314
+ lora_name
 
 
315
  ):
316
+ if f'{lora_name}.lora_up.weight' in lyco_state_dict:
317
+ up = lyco_state_dict[f'{lora_name}.lora_up.weight']
318
+ down = lyco_state_dict[f'{lora_name}.lora_down.weight']
319
+ mid = lyco_state_dict.get(f'{lora_name}.lora_mid.weight', None)
320
+ alpha = lyco_state_dict.get(f'{lora_name}.alpha', None)
321
+ return 'locon', (up, down, mid, alpha)
322
+ elif f'{lora_name}.hada_w1_a' in lyco_state_dict:
323
+ w1a = lyco_state_dict[f'{lora_name}.hada_w1_a']
324
+ w1b = lyco_state_dict[f'{lora_name}.hada_w1_b']
325
+ w2a = lyco_state_dict[f'{lora_name}.hada_w2_a']
326
+ w2b = lyco_state_dict[f'{lora_name}.hada_w2_b']
327
+ t1 = lyco_state_dict.get(f'{lora_name}.hada_t1', None)
328
+ t2 = lyco_state_dict.get(f'{lora_name}.hada_t2', None)
329
+ alpha = lyco_state_dict.get(f'{lora_name}.alpha', None)
330
+ return 'hada', (w1a, w1b, w2a, w2b, t1, t2, alpha)
331
+ elif f'{lora_name}.weight' in lyco_state_dict:
332
+ weight = lyco_state_dict[f'{lora_name}.weight']
333
+ on_input = lyco_state_dict.get(f'{lora_name}.on_input', False)
334
+ return 'ia3', (weight, on_input)
335
+ elif (f'{lora_name}.lokr_w1' in lyco_state_dict
336
+ or f'{lora_name}.lokr_w1_a' in lyco_state_dict):
337
+ w1 = lyco_state_dict.get(f'{lora_name}.lokr_w1', None)
338
+ w1a = lyco_state_dict.get(f'{lora_name}.lokr_w1_a', None)
339
+ w1b = lyco_state_dict.get(f'{lora_name}.lokr_w1_b', None)
340
+ w2 = lyco_state_dict.get(f'{lora_name}.lokr_w2', None)
341
+ w2a = lyco_state_dict.get(f'{lora_name}.lokr_w2_a', None)
342
+ w2b = lyco_state_dict.get(f'{lora_name}.lokr_w2_b', None)
343
+ t1 = lyco_state_dict.get(f'{lora_name}.lokr_t1', None)
344
+ t2 = lyco_state_dict.get(f'{lora_name}.lokr_t2', None)
345
+ alpha = lyco_state_dict.get(f'{lora_name}.alpha', None)
346
+ return 'kron', (w1, w1a, w1b, w2, w2a, w2b, t1, t2, alpha)
347
+ elif f'{lora_name}.diff' in lyco_state_dict:
348
+ return 'full', lyco_state_dict[f'{lora_name}.diff']
349
+ else:
350
+ return 'None', ()
351
+
352
+
353
+ def cp_weight_from_conv(
354
+ up, down, mid
355
+ ):
356
+ up = up.reshape(up.size(0), up.size(1))
357
+ down = down.reshape(down.size(0), down.size(1))
358
+ return torch.einsum('m n w h, i m, n j -> i j w h', mid, up, down)
359
+
360
+ def cp_weight(
361
+ wa, wb, t
362
+ ):
363
+ temp = torch.einsum('i j k l, j r -> i r k l', t, wb)
364
+ return torch.einsum('i j k l, i r -> r j k l', temp, wa)
365
+
366
+
367
+ @torch.no_grad()
368
+ def rebuild_weight(module_type, params, orig_weight, scale=1):
369
+ if orig_weight is None:
370
+ return orig_weight
371
+ merged = orig_weight
372
+ if module_type == 'locon':
373
+ up, down, mid, alpha = params
374
+ if alpha is not None:
375
+ scale *= alpha/up.size(1)
376
+ if mid is not None:
377
+ rebuild = cp_weight_from_conv(up, down, mid)
378
+ else:
379
+ rebuild = up.reshape(up.size(0),-1) @ down.reshape(down.size(0), -1)
380
+ merged = orig_weight + rebuild.reshape(orig_weight.shape) * scale
381
+ del up, down, mid, alpha, params, rebuild
382
+ elif module_type == 'hada':
383
+ w1a, w1b, w2a, w2b, t1, t2, alpha = params
384
+ if alpha is not None:
385
+ scale *= alpha / w1b.size(0)
386
+ if t1 is not None:
387
+ rebuild1 = cp_weight(w1a, w1b, t1)
388
+ else:
389
+ rebuild1 = w1a @ w1b
390
+ if t2 is not None:
391
+ rebuild2 = cp_weight(w2a, w2b, t2)
392
+ else:
393
+ rebuild2 = w2a @ w2b
394
+ rebuild = (rebuild1 * rebuild2).reshape(orig_weight.shape)
395
+ merged = orig_weight + rebuild * scale
396
+ del w1a, w1b, w2a, w2b, t1, t2, alpha, params, rebuild, rebuild1, rebuild2
397
+ elif module_type == 'ia3':
398
+ weight, on_input = params
399
+ if not on_input:
400
+ weight = weight.reshape(-1, 1)
401
+ merged = orig_weight + weight * orig_weight * scale
402
+ del weight, on_input, params
403
+ elif module_type == 'kron':
404
+ w1, w1a, w1b, w2, w2a, w2b, t1, t2, alpha = params
405
+ if alpha is not None and (w1b is not None or w2b is not None):
406
+ scale *= alpha / (w1b.size(0) if w1b is not None else w2b.size(0))
407
+ if w1a is not None and w1b is not None:
408
+ if t1 is not None:
409
+ w1 = cp_weight(w1a, w1b, t1)
410
+ else:
411
+ w1 = w1a @ w1b
412
+ if w2a is not None and w2b is not None:
413
+ if t2 is not None:
414
+ w2 = cp_weight(w2a, w2b, t2)
415
+ else:
416
+ w2 = w2a @ w2b
417
+ rebuild = torch.kron(w1, w2).reshape(orig_weight.shape)
418
+ merged = orig_weight + rebuild* scale
419
+ del w1, w1a, w1b, w2, w2a, w2b, t1, t2, alpha, params, rebuild
420
+ elif module_type == 'full':
421
+ rebuild = params.reshape(orig_weight.shape)
422
+ merged = orig_weight + rebuild * scale
423
+ del params, rebuild
424
 
425
+ return merged
426
 
427
 
428
+ def merge(
429
  base_model,
430
+ lyco_state_dict,
431
  scale: float = 1.0,
432
  device = 'cpu'
433
  ):
 
438
  "Downsample2D",
439
  "Upsample2D"
440
  ]
441
+ UNET_TARGET_REPLACE_NAME = [
442
+ "conv_in",
443
+ "conv_out",
444
+ "time_embedding.linear_1",
445
+ "time_embedding.linear_2",
446
+ ]
447
  TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
448
  LORA_PREFIX_UNET = 'lora_unet'
449
  LORA_PREFIX_TEXT_ENCODER = 'lora_te'
450
+ merged = 0
451
+ def merge_state_dict(
452
  prefix,
453
  root_module: torch.nn.Module,
454
+ lyco_state_dict: Dict[str,torch.Tensor],
455
+ target_replace_modules,
456
+ target_replace_names = []
457
  ):
458
+ nonlocal merged
459
+ for name, module in tqdm(list(root_module.named_modules()), desc=f'Merging {prefix}'):
 
460
  if module.__class__.__name__ in target_replace_modules:
 
461
  for child_name, child_module in module.named_modules():
462
+ if child_module.__class__.__name__ not in {'Linear', 'Conv2d'}:
 
463
  continue
464
  lora_name = prefix + '.' + name + '.' + child_name
465
  lora_name = lora_name.replace('.', '_')
466
 
467
+ result = rebuild_weight(*get_module(
468
+ lyco_state_dict, lora_name
469
+ ), getattr(child_module, 'weight'), scale)
470
+ if result is not None:
471
+ merged += 1
472
+ child_module.requires_grad_(False)
473
+ child_module.weight.copy_(result)
474
+ elif name in target_replace_names:
475
+ lora_name = prefix + '.' + name
476
+ lora_name = lora_name.replace('.', '_')
477
 
478
+ result = rebuild_weight(*get_module(
479
+ lyco_state_dict, lora_name
480
+ ), getattr(module, 'weight'), scale)
481
+ if result is not None:
482
+ merged += 1
483
+ module.requires_grad_(False)
484
+ module.weight.copy_(result)
485
 
486
+ if device == 'cpu':
487
+ for k, v in tqdm(list(lyco_state_dict.items()), desc='Converting Dtype'):
488
+ lyco_state_dict[k] = v.float()
489
+
490
+ merge_state_dict(
491
+ LORA_PREFIX_TEXT_ENCODER,
492
+ base_model[0],
493
+ lyco_state_dict,
494
+ TEXT_ENCODER_TARGET_REPLACE_MODULE,
495
+ UNET_TARGET_REPLACE_NAME
496
  )
497
+ merge_state_dict(
498
  LORA_PREFIX_UNET,
499
+ base_model[2],
500
+ lyco_state_dict,
501
+ UNET_TARGET_REPLACE_MODULE,
502
+ UNET_TARGET_REPLACE_NAME
503
+ )
504
+ print(f'{merged} modules were merged')
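
To illustrate the merge path above with concrete shapes (a sketch, not part of the commit): for module_type 'locon' without a mid/CP weight, rebuild_weight scales up @ down by alpha / rank and adds it onto the original weight, which merge_state_dict then copies back into the module in place.

import torch

out_dim, in_dim, rank = 32, 64, 4
orig = torch.randn(out_dim, in_dim)   # stands in for child_module.weight
up = torch.randn(out_dim, rank)       # {lora_name}.lora_up.weight
down = torch.randn(rank, in_dim)      # {lora_name}.lora_down.weight
alpha = torch.tensor(4.0)

scale = float(alpha) / up.size(1)     # alpha / rank, as in rebuild_weight()
rebuild = up.reshape(up.size(0), -1) @ down.reshape(down.size(0), -1)
merged = orig + rebuild.reshape(orig.shape) * scale
print(merged.shape)                   # torch.Size([32, 64])
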