import itertools from typing import Any, Optional, Dict, Tuple import torch from diffusers import StableDiffusionPipeline, AutoencoderKL from diffusers import Transformer2DModel, ModelMixin, ConfigMixin from diffusers import UNet2DConditionModel from diffusers.configuration_utils import register_to_config from diffusers.models.attention import BasicTransformerBlock from diffusers.models.resnet import ResnetBlock2D, Downsample2D, Upsample2D try: from diffusers.models.transformer_2d import Transformer2DModelOutput except ImportError: from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from torch import nn from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor FlexibleUnetConfigurations = { # General parameters for all blocks 'sample_size': 64, 'temb_dim': 320 * 4, 'resnet_eps': 1e-5, 'resnet_act_fn': 'silu', 'num_attention_heads': 8, 'cross_attention_dim': 768, # Controls modules execute order in unet's forward 'mix_block_in_forward': True, # Down blocks parameters 'down_blocks_in_channels': [320, 320, 640], 'down_blocks_out_channels': [320, 640, 1280], 'down_blocks_num_attentions': [0, 1, 3], 'down_blocks_num_resnets': [2, 2, 1], 'add_downsample': [True, True, True], # Middle block parameters 'add_upsample_mid_block': True, 'mid_num_resnets': 4, 'mid_num_attentions': 2, # Up block parameters 'prev_output_channels': [1280, 1280, 640], 'up_blocks_num_attentions': [6, 3, 0], 'up_blocks_num_resnets': [2, 3, 3], 'add_upsample': [True, True, False], } def custom_sort_order(obj): """ Key function for sorting order of execution in forward methods """ return {ResnetBlock2D: 0, Transformer2DModel: 1, FlexibleTransformer2DModel: 1}.get(obj.__class__) class FlexibleIdentityBlock(nn.Module): def forward( self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, ): return hidden_states class FlexibleUNet2DConditionModel(UNet2DConditionModel, ModelMixin): configurations = FlexibleUnetConfigurations @register_to_config def __init__(self): super().__init__(sample_size=self.configurations.get('sample_size', FlexibleUnetConfigurations['sample_size']), cross_attention_dim=self.configurations.get("cross_attention_dim", FlexibleUnetConfigurations['cross_attention_dim'])) num_attention_heads = self.configurations.get("num_attention_heads") cross_attention_dim = self.configurations.get("cross_attention_dim") mix_block_in_forward = self.configurations.get("mix_block_in_forward") resnet_act_fn = self.configurations.get("resnet_act_fn") resnet_eps = self.configurations.get("resnet_eps") temb_dim = self.configurations.get("temb_dim") ############### # Down blocks # ############### down_blocks_num_attentions = self.configurations.get("down_blocks_num_attentions") down_blocks_out_channels = self.configurations.get("down_blocks_out_channels") down_blocks_in_channels = self.configurations.get("down_blocks_in_channels") down_blocks_num_resnets = self.configurations.get("down_blocks_num_resnets") add_downsample = self.configurations.get("add_downsample") self.down_blocks = nn.ModuleList() for i, (in_c, out_c, n_res, n_att, add_down) in enumerate(zip(down_blocks_in_channels, down_blocks_out_channels, down_blocks_num_resnets, down_blocks_num_attentions, add_downsample)): last_block = i == len(down_blocks_in_channels) - 1 self.down_blocks.append(FlexibleCrossAttnDownBlock2D(in_channels=in_c, out_channels=out_c, temb_channels=temb_dim, num_resnets=n_res, num_attentions=n_att, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, num_attention_heads=num_attention_heads, cross_attention_dim=cross_attention_dim, add_downsample=add_down, last_block=last_block, mix_block_in_forward=mix_block_in_forward)) ############### # Mid blocks # ############### mid_block_add_upsample = self.configurations.get("add_upsample_mid_block") mid_num_attentions = self.configurations.get("mid_num_attentions") mid_num_resnets = self.configurations.get("mid_num_resnets") if mid_num_resnets == mid_num_attentions == 0: self.mid_block = FlexibleIdentityBlock() else: self.mid_block = FlexibleUNetMidBlock2DCrossAttn(in_channels=down_blocks_out_channels[-1], temb_channels=temb_dim, resnet_act_fn=resnet_act_fn, resnet_eps=resnet_eps, cross_attention_dim=cross_attention_dim, num_attention_heads=num_attention_heads, num_resnets=mid_num_resnets, num_attentions=mid_num_attentions, mix_block_in_forward=mix_block_in_forward, add_upsample=mid_block_add_upsample ) ############### # Up blocks # ############### up_blocks_num_attentions = self.configurations.get("up_blocks_num_attentions") up_blocks_num_resnets = self.configurations.get("up_blocks_num_resnets") prev_output_channels = self.configurations.get("prev_output_channels") up_upsample = self.configurations.get("add_upsample") self.up_blocks = nn.ModuleList() for in_c, out_c, prev_out, n_res, n_att, add_up in zip(reversed(down_blocks_in_channels), reversed(down_blocks_out_channels), prev_output_channels, up_blocks_num_resnets, up_blocks_num_attentions, up_upsample): self.up_blocks.append(FlexibleCrossAttnUpBlock2D(in_channels=in_c, out_channels=out_c, prev_output_channel=prev_out, temb_channels=temb_dim, num_resnets=n_res, num_attentions=n_att, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, num_attention_heads=num_attention_heads, cross_attention_dim=cross_attention_dim, add_upsample=add_up, mix_block_in_forward=mix_block_in_forward )) class FlexibleCrossAttnDownBlock2D(nn.Module): def __init__( self, in_channels: int, out_channels: int, temb_channels: int, dropout: float = 0.0, num_resnets: int = 1, num_attentions: int = 1, transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, num_attention_heads: int = 1, cross_attention_dim: int = 1280, output_scale_factor: float = 1.0, downsample_padding: int = 1, add_downsample: bool = True, use_linear_projection: bool = False, only_cross_attention: bool = False, upcast_attention: bool = False, last_block: bool = False, mix_block_in_forward: bool = True, ): super().__init__() self.last_block = last_block self.mix_block_in_forward = mix_block_in_forward self.has_cross_attention = True self.num_attention_heads = num_attention_heads modules = [] add_resnets = [True] * num_resnets add_cross_attentions = [True] * num_attentions for i, (add_resnet, add_cross_attention) in enumerate( itertools.zip_longest(add_resnets, add_cross_attentions, fillvalue=False)): in_channels = in_channels if i == 0 else out_channels if add_resnet: modules.append( ResnetBlock2D( in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, ) ) if add_cross_attention: modules.append( FlexibleTransformer2DModel( num_attention_heads=num_attention_heads, attention_head_dim=out_channels // num_attention_heads, in_channels=out_channels, num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, ) ) if not mix_block_in_forward: modules = sorted(modules, key=custom_sort_order) self.modules_list = nn.ModuleList(modules) if add_downsample: self.downsamplers = nn.ModuleList( [ Downsample2D( out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" ) ] ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, ): output_states = () for module in self.modules_list: if isinstance(module, ResnetBlock2D): hidden_states = module(hidden_states, temb) elif isinstance(module, (FlexibleTransformer2DModel, Transformer2DModel)): hidden_states = module( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, attention_mask=attention_mask, encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] else: raise ValueError(f'Got an unexpected module in modules list! {type(module)}') if isinstance(module, ResnetBlock2D): output_states = output_states + (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) if not self.last_block: output_states = output_states + (hidden_states,) return hidden_states, output_states class FlexibleCrossAttnUpBlock2D(nn.Module): def __init__( self, in_channels: int, out_channels: int, prev_output_channel: int, temb_channels: int, dropout: float = 0.0, num_resnets: int = 1, num_attentions: int = 1, transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, num_attention_heads: int = 1, cross_attention_dim: int = 1280, output_scale_factor: float = 1.0, add_upsample: bool = True, use_linear_projection: bool = False, only_cross_attention: bool = False, upcast_attention: bool = False, mix_block_in_forward: bool = True ): super().__init__() modules = [] # WARNING: This parameter is filled with number of resnets and used within StableDiffusionPipeline self.resnets = [] self.has_cross_attention = True self.num_attention_heads = num_attention_heads add_resnets = [True] * num_resnets add_cross_attentions = [True] * num_attentions for i, (add_resnet, add_cross_attention) in enumerate( itertools.zip_longest(add_resnets, add_cross_attentions, fillvalue=False)): res_skip_channels = in_channels if (i == len(add_resnets) - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels if add_resnet: self.resnets += [True] modules.append( ResnetBlock2D( in_channels=resnet_in_channels + res_skip_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, ) ) if add_cross_attention: modules.append( FlexibleTransformer2DModel( num_attention_heads, out_channels // num_attention_heads, in_channels=out_channels, num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, ) ) if not mix_block_in_forward: modules = sorted(modules, key=custom_sort_order) self.modules_list = nn.ModuleList(modules) self.upsamplers = None if add_upsample: self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) self.gradient_checkpointing = False def forward( self, hidden_states: torch.FloatTensor, res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], temb: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, attention_mask: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, ): for module in self.modules_list: if isinstance(module, ResnetBlock2D): res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) hidden_states = module(hidden_states, temb) if isinstance(module, (FlexibleTransformer2DModel, Transformer2DModel)): hidden_states = module( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, attention_mask=attention_mask, encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] if self.upsamplers is not None: for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states, upsample_size) return hidden_states class FlexibleUNetMidBlock2DCrossAttn(nn.Module): def __init__( self, in_channels: int, temb_channels: int, dropout: float = 0.0, num_resnets: int = 1, num_attentions: int = 1, transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, num_attention_heads: int = 1, output_scale_factor: float = 1.0, cross_attention_dim: int = 1280, use_linear_projection: bool = False, upcast_attention: bool = False, mix_block_in_forward: bool = True, add_upsample: bool = True, ): super().__init__() self.has_cross_attention = True self.num_attention_heads = num_attention_heads resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # There is always at least one resnet modules = [ResnetBlock2D( in_channels=in_channels, out_channels=in_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, )] add_resnets = [True] * num_resnets add_cross_attentions = [True] * num_attentions for i, (add_resnet, add_cross_attention) in enumerate( itertools.zip_longest(add_resnets, add_cross_attentions, fillvalue=False)): if add_cross_attention: modules.append( FlexibleTransformer2DModel( num_attention_heads, in_channels // num_attention_heads, in_channels=in_channels, num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, ) ) if add_resnet: modules.append( ResnetBlock2D( in_channels=in_channels, out_channels=in_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, ) ) if not mix_block_in_forward: modules = sorted(modules, key=custom_sort_order) self.modules_list = nn.ModuleList(modules) self.upsamplers = nn.ModuleList([nn.Identity()]) if add_upsample: self.upsamplers = nn.ModuleList([Upsample2D(in_channels, use_conv=True, out_channels=in_channels)]) def forward( self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, ) -> torch.FloatTensor: hidden_states = self.modules_list[0](hidden_states, temb) for module in self.modules_list: if isinstance(module, (FlexibleTransformer2DModel, Transformer2DModel)): hidden_states = module( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, attention_mask=attention_mask, encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] elif isinstance(module, ResnetBlock2D): hidden_states = module(hidden_states, temb) for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states) return hidden_states class FlexibleTransformer2DModel(ModelMixin, ConfigMixin): @register_to_config def __init__( self, num_attention_heads: int = 16, attention_head_dim: int = 88, in_channels: Optional[int] = None, out_channels: Optional[int] = None, num_layers: int = 1, dropout: float = 0.0, norm_num_groups: int = 32, cross_attention_dim: Optional[int] = None, attention_bias: bool = False, activation_fn: str = "geglu", num_embeds_ada_norm: Optional[int] = None, only_cross_attention: bool = False, use_linear_projection: bool = False, upcast_attention: bool = False, norm_type: str = "layer_norm", norm_elementwise_affine: bool = True, ): super().__init__() self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim self.in_channels = in_channels inner_dim = num_attention_heads * attention_head_dim # Define input layers self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) self.use_linear_projection = use_linear_projection if self.use_linear_projection: self.proj_in = nn.Linear(in_channels, inner_dim) else: self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) # Define transformers blocks self.transformer_blocks = nn.ModuleList( [ BasicTransformerBlock( inner_dim, num_attention_heads, attention_head_dim, dropout=dropout, cross_attention_dim=cross_attention_dim, activation_fn=activation_fn, num_embeds_ada_norm=num_embeds_ada_norm, attention_bias=attention_bias, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, norm_type=norm_type, norm_elementwise_affine=norm_elementwise_affine, ) for _ in range(num_layers) ] ) # Define output layers self.out_channels = in_channels if out_channels is None else out_channels if self.use_linear_projection: self.proj_out = nn.Linear(inner_dim, in_channels) else: self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, class_labels: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, attention_mask: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, return_dict: bool = False ): # 1. Input batch, _, height, width = hidden_states.shape residual = hidden_states hidden_states = self.norm(hidden_states) if not self.use_linear_projection: hidden_states = self.proj_in(hidden_states) inner_dim = hidden_states.shape[1] hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) else: inner_dim = hidden_states.shape[1] hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) hidden_states = self.proj_in(hidden_states) # 2. Blocks for block in self.transformer_blocks: hidden_states = block( hidden_states, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, class_labels=class_labels, ) # 3. Output if not self.use_linear_projection: hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() hidden_states = self.proj_out(hidden_states) else: hidden_states = self.proj_out(hidden_states) hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() output = hidden_states + residual if return_dict: return (output,) return Transformer2DModelOutput(sample=output) class DeciDiffusionPipeline(StableDiffusionPipeline): deci_default_number_of_iterations = 30 deci_default_guidance_rescale = 0.7 def __init__(self, vae: AutoencoderKL, text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, requires_safety_checker: bool = True ): # Replace UNet with Deci`s unet del unet unet = FlexibleUNet2DConditionModel() super().__init__(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, requires_safety_checker=requires_safety_checker ) self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor) def __call__(self, *args, **kwargs): # Set up default training parameters (if not given by user specifically) if "guidance_rescale" not in kwargs: kwargs.update({'guidance_rescale': self.deci_default_guidance_rescale}) if "num_inference_steps" not in kwargs: kwargs.update({'num_inference_steps': self.deci_default_number_of_iterations}) return super().__call__(*args, **kwargs)