Spaces:

anchorxia
/

MuseVSpace

Runtime error

App Files Files Community

MuseVSpace / MuseV /musev /models /referencenet.py

anchorxia

add musev

96d7ad8 6 months ago

raw

history blame contribute delete

55.6 kB

	# Copyright 2023 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from __future__ import annotations

	from typing import Any, Dict, List, Optional, Tuple, Union
	import logging

	import torch
	from diffusers.models.attention_processor import Attention, AttnProcessor
	from einops import rearrange, repeat
	import torch.nn as nn
	import torch.nn.functional as F
	import xformers
	from diffusers.models.lora import LoRACompatibleLinear
	from diffusers.models.unet_2d_condition import (
	UNet2DConditionModel,
	UNet2DConditionOutput,
	)
	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.utils.constants import USE_PEFT_BACKEND
	from diffusers.utils.deprecation_utils import deprecate
	from diffusers.utils.peft_utils import scale_lora_layers, unscale_lora_layers
	from diffusers.utils.torch_utils import maybe_allow_in_graph
	from diffusers.models.modeling_utils import ModelMixin, load_state_dict
	from diffusers.loaders import UNet2DConditionLoadersMixin
	from diffusers.utils import (
	USE_PEFT_BACKEND,
	BaseOutput,
	deprecate,
	scale_lora_layers,
	unscale_lora_layers,
	)
	from diffusers.models.activations import get_activation
	from diffusers.models.attention_processor import (
	ADDED_KV_ATTENTION_PROCESSORS,
	CROSS_ATTENTION_PROCESSORS,
	AttentionProcessor,
	AttnAddedKVProcessor,
	AttnProcessor,
	)
	from diffusers.models.embeddings import (
	GaussianFourierProjection,
	ImageHintTimeEmbedding,
	ImageProjection,
	ImageTimeEmbedding,
	PositionNet,
	TextImageProjection,
	TextImageTimeEmbedding,
	TextTimeEmbedding,
	TimestepEmbedding,
	Timesteps,
	)
	from diffusers.models.modeling_utils import ModelMixin


	from ..data.data_util import align_repeat_tensor_single_dim
	from .unet_3d_condition import UNet3DConditionModel
	from .attention import BasicTransformerBlock, IPAttention
	from .unet_2d_blocks import (
	UNetMidBlock2D,
	UNetMidBlock2DCrossAttn,
	UNetMidBlock2DSimpleCrossAttn,
	get_down_block,
	get_up_block,
	)

	from . import Model_Register


	logger = logging.getLogger(__name__)


	@Model_Register.register
	class ReferenceNet2D(UNet2DConditionModel, nn.Module):
	"""继承 UNet2DConditionModel. 新增功能，类似controlnet 返回模型中间特征，用于后续作用
	Inherit Unet2DConditionModel. Add new functions, similar to controlnet, return the intermediate features of the model for subsequent effects
	Args:
	UNet2DConditionModel (_type_): _description_
	"""

	_supports_gradient_checkpointing = True
	print_idx = 0

	@register_to_config
	def __init__(
	    self,
	    sample_size: int | None = None,
	    in_channels: int = 4,
	    out_channels: int = 4,
	    center_input_sample: bool = False,
	    flip_sin_to_cos: bool = True,
	    freq_shift: int = 0,
	    down_block_types: Tuple[str] = (
	        "CrossAttnDownBlock2D",
	        "CrossAttnDownBlock2D",
	        "CrossAttnDownBlock2D",
	        "DownBlock2D",
	    ),
	    mid_block_type: str | None = "UNetMidBlock2DCrossAttn",
	    up_block_types: Tuple[str] = (
	        "UpBlock2D",
	        "CrossAttnUpBlock2D",
	        "CrossAttnUpBlock2D",
	        "CrossAttnUpBlock2D",
	    ),
	    only_cross_attention: bool | Tuple[bool] = False,
	    block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
	    layers_per_block: int | Tuple[int] = 2,
	    downsample_padding: int = 1,
	    mid_block_scale_factor: float = 1,
	    dropout: float = 0,
	    act_fn: str = "silu",
	    norm_num_groups: int | None = 32,
	    norm_eps: float = 0.00001,
	    cross_attention_dim: int | Tuple[int] = 1280,
	    transformer_layers_per_block: int | Tuple[int] | Tuple[Tuple] = 1,
	    reverse_transformer_layers_per_block: Tuple[Tuple[int]] | None = None,
	    encoder_hid_dim: int | None = None,
	    encoder_hid_dim_type: str | None = None,
	    attention_head_dim: int | Tuple[int] = 8,
	    num_attention_heads: int | Tuple[int] | None = None,
	    dual_cross_attention: bool = False,
	    use_linear_projection: bool = False,
	    class_embed_type: str | None = None,
	    addition_embed_type: str | None = None,
	    addition_time_embed_dim: int | None = None,
	    num_class_embeds: int | None = None,
	    upcast_attention: bool = False,
	    resnet_time_scale_shift: str = "default",
	    resnet_skip_time_act: bool = False,
	    resnet_out_scale_factor: int = 1,
	    time_embedding_type: str = "positional",
	    time_embedding_dim: int | None = None,
	    time_embedding_act_fn: str | None = None,
	    timestep_post_act: str | None = None,
	    time_cond_proj_dim: int | None = None,
	    conv_in_kernel: int = 3,
	    conv_out_kernel: int = 3,
	    projection_class_embeddings_input_dim: int | None = None,
	    attention_type: str = "default",
	    class_embeddings_concat: bool = False,
	    mid_block_only_cross_attention: bool | None = None,
	    cross_attention_norm: str | None = None,
	    addition_embed_type_num_heads: int = 64,
	    need_self_attn_block_embs: bool = False,
	    need_block_embs: bool = False,
	):
	    """Build the reference network layers.

	    The constructor validates the per-block configuration, then creates the
	    input convolution, the timestep / class / additional embeddings, the
	    down blocks, the mid block and the up blocks. The output head
	    (``conv_norm_out``, ``conv_act``, ``conv_out``) is created and then set
	    to ``None``, and the up blocks are either pruned or dropped depending on
	    ``need_self_attn_block_embs``.

	    Only the arguments whose handling is specific to this constructor are
	    described here; all others are forwarded unchanged to the embedding and
	    block builders called below.

	    Args:
	        down_block_types: Block type name for every down stage.
	        up_block_types: Block type name for every up stage; must have the
	            same length as ``down_block_types``.
	        block_out_channels: Output channels per down stage; must have the
	            same length as ``down_block_types``.
	        mid_block_type: ``"UNetMidBlock2DCrossAttn"``,
	            ``"UNetMidBlock2DSimpleCrossAttn"``, ``"UNetMidBlock2D"`` or
	            ``None`` (no mid block).
	        num_attention_heads: Must be ``None``; the value is taken from
	            ``attention_head_dim`` instead.
	        time_embedding_type: ``"fourier"`` or ``"positional"``.
	        encoder_hid_dim_type: ``None``, ``"text_proj"``,
	            ``"text_image_proj"`` or ``"image_proj"``. Defaults to
	            ``"text_proj"`` when only ``encoder_hid_dim`` is given.
	        addition_embed_type: ``None``, ``"text"``, ``"text_image"``,
	            ``"text_time"``, ``"image"`` or ``"image_hint"``.
	        need_self_attn_block_embs: Stored on the instance. When ``False``
	            the up blocks are discarded (``self.up_blocks = None``); when
	            ``True`` they are kept and the tail of the last up block is
	            pruned.
	        need_block_embs: Stored on the instance; not otherwise used by the
	            constructor.

	    Raises:
	        ValueError: If ``num_attention_heads`` is given, if the per-block
	            sequences disagree in length with ``down_block_types``, or if
	            an embedding / mid-block type is unknown or is missing a
	            required companion argument.
	    """
	    # NOTE(review): this runs the parent constructor with all of its default
	    # arguments before every layer is re-created below — confirm whether the
	    # parent's default layers really need to be instantiated here.
	    super().__init__()

	    self.sample_size = sample_size

	    if num_attention_heads is not None:
	        raise ValueError(
	            "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
	        )

	    # If `num_attention_heads` is not defined (which is the case for most models)
	    # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
	    # The reason for this behavior is to correct for incorrectly named variables that were introduced
	    # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
	    # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
	    # which is why we correct for the naming here.
	    num_attention_heads = num_attention_heads or attention_head_dim

	    # Check inputs
	    if len(down_block_types) != len(up_block_types):
	        raise ValueError(
	            f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
	        )

	    if len(block_out_channels) != len(down_block_types):
	        raise ValueError(
	            f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
	        )

	    if not isinstance(only_cross_attention, bool) and len(
	        only_cross_attention
	    ) != len(down_block_types):
	        raise ValueError(
	            f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
	        )

	    if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(
	        down_block_types
	    ):
	        raise ValueError(
	            f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
	        )

	    if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(
	        down_block_types
	    ):
	        raise ValueError(
	            f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
	        )

	    if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(
	        down_block_types
	    ):
	        raise ValueError(
	            f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
	        )

	    if not isinstance(layers_per_block, int) and len(layers_per_block) != len(
	        down_block_types
	    ):
	        raise ValueError(
	            f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
	        )
	    if (
	        isinstance(transformer_layers_per_block, list)
	        and reverse_transformer_layers_per_block is None
	    ):
	        # Nested lists describe per-layer depths; the up path then cannot be
	        # derived by simply reversing the down path.
	        for layer_number_per_block in transformer_layers_per_block:
	            if isinstance(layer_number_per_block, list):
	                raise ValueError(
	                    "Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet."
	                )

	    # input
	    conv_in_padding = (conv_in_kernel - 1) // 2
	    self.conv_in = nn.Conv2d(
	        in_channels,
	        block_out_channels[0],
	        kernel_size=conv_in_kernel,
	        padding=conv_in_padding,
	    )

	    # time
	    if time_embedding_type == "fourier":
	        time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
	        if time_embed_dim % 2 != 0:
	            raise ValueError(
	                f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}."
	            )
	        self.time_proj = GaussianFourierProjection(
	            time_embed_dim // 2,
	            set_W_to_weight=False,
	            log=False,
	            flip_sin_to_cos=flip_sin_to_cos,
	        )
	        timestep_input_dim = time_embed_dim
	    elif time_embedding_type == "positional":
	        time_embed_dim = time_embedding_dim or block_out_channels[0] * 4

	        self.time_proj = Timesteps(
	            block_out_channels[0], flip_sin_to_cos, freq_shift
	        )
	        timestep_input_dim = block_out_channels[0]
	    else:
	        raise ValueError(
	            f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
	        )

	    self.time_embedding = TimestepEmbedding(
	        timestep_input_dim,
	        time_embed_dim,
	        act_fn=act_fn,
	        post_act_fn=timestep_post_act,
	        cond_proj_dim=time_cond_proj_dim,
	    )

	    if encoder_hid_dim_type is None and encoder_hid_dim is not None:
	        encoder_hid_dim_type = "text_proj"
	        self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
	        logger.info(
	            "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined."
	        )

	    if encoder_hid_dim is None and encoder_hid_dim_type is not None:
	        raise ValueError(
	            f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
	        )

	    if encoder_hid_dim_type == "text_proj":
	        self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
	    elif encoder_hid_dim_type == "text_image_proj":
	        # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
	        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
	        # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
	        self.encoder_hid_proj = TextImageProjection(
	            text_embed_dim=encoder_hid_dim,
	            image_embed_dim=cross_attention_dim,
	            cross_attention_dim=cross_attention_dim,
	        )
	    elif encoder_hid_dim_type == "image_proj":
	        # Kandinsky 2.2
	        self.encoder_hid_proj = ImageProjection(
	            image_embed_dim=encoder_hid_dim,
	            cross_attention_dim=cross_attention_dim,
	        )
	    elif encoder_hid_dim_type is not None:
	        # The message lists every value accepted by the branches above.
	        raise ValueError(
	            f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj', 'text_image_proj' or 'image_proj'."
	        )
	    else:
	        self.encoder_hid_proj = None

	    # class embedding
	    if class_embed_type is None and num_class_embeds is not None:
	        self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
	    elif class_embed_type == "timestep":
	        self.class_embedding = TimestepEmbedding(
	            timestep_input_dim, time_embed_dim, act_fn=act_fn
	        )
	    elif class_embed_type == "identity":
	        self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
	    elif class_embed_type == "projection":
	        if projection_class_embeddings_input_dim is None:
	            raise ValueError(
	                "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
	            )
	        # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
	        # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
	        # 2. it projects from an arbitrary input dimension.
	        #
	        # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
	        # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
	        # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
	        self.class_embedding = TimestepEmbedding(
	            projection_class_embeddings_input_dim, time_embed_dim
	        )
	    elif class_embed_type == "simple_projection":
	        if projection_class_embeddings_input_dim is None:
	            raise ValueError(
	                "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
	            )
	        self.class_embedding = nn.Linear(
	            projection_class_embeddings_input_dim, time_embed_dim
	        )
	    else:
	        self.class_embedding = None

	    if addition_embed_type == "text":
	        if encoder_hid_dim is not None:
	            text_time_embedding_from_dim = encoder_hid_dim
	        else:
	            text_time_embedding_from_dim = cross_attention_dim

	        self.add_embedding = TextTimeEmbedding(
	            text_time_embedding_from_dim,
	            time_embed_dim,
	            num_heads=addition_embed_type_num_heads,
	        )
	    elif addition_embed_type == "text_image":
	        # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
	        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
	        # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
	        self.add_embedding = TextImageTimeEmbedding(
	            text_embed_dim=cross_attention_dim,
	            image_embed_dim=cross_attention_dim,
	            time_embed_dim=time_embed_dim,
	        )
	    elif addition_embed_type == "text_time":
	        self.add_time_proj = Timesteps(
	            addition_time_embed_dim, flip_sin_to_cos, freq_shift
	        )
	        self.add_embedding = TimestepEmbedding(
	            projection_class_embeddings_input_dim, time_embed_dim
	        )
	    elif addition_embed_type == "image":
	        # Kandinsky 2.2
	        self.add_embedding = ImageTimeEmbedding(
	            image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
	        )
	    elif addition_embed_type == "image_hint":
	        # Kandinsky 2.2 ControlNet
	        self.add_embedding = ImageHintTimeEmbedding(
	            image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
	        )
	    elif addition_embed_type is not None:
	        # The message lists every value accepted by the branches above.
	        raise ValueError(
	            f"addition_embed_type: {addition_embed_type} must be None, 'text', 'text_image', 'text_time', 'image' or 'image_hint'."
	        )

	    if time_embedding_act_fn is None:
	        self.time_embed_act = None
	    else:
	        self.time_embed_act = get_activation(time_embedding_act_fn)

	    self.down_blocks = nn.ModuleList([])
	    self.up_blocks = nn.ModuleList([])

	    # Broadcast scalar settings to one entry per down stage.
	    if isinstance(only_cross_attention, bool):
	        if mid_block_only_cross_attention is None:
	            mid_block_only_cross_attention = only_cross_attention

	        only_cross_attention = [only_cross_attention] * len(down_block_types)

	    if mid_block_only_cross_attention is None:
	        mid_block_only_cross_attention = False

	    if isinstance(num_attention_heads, int):
	        num_attention_heads = (num_attention_heads,) * len(down_block_types)

	    if isinstance(attention_head_dim, int):
	        attention_head_dim = (attention_head_dim,) * len(down_block_types)

	    if isinstance(cross_attention_dim, int):
	        cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

	    if isinstance(layers_per_block, int):
	        layers_per_block = [layers_per_block] * len(down_block_types)

	    if isinstance(transformer_layers_per_block, int):
	        transformer_layers_per_block = [transformer_layers_per_block] * len(
	            down_block_types
	        )

	    if class_embeddings_concat:
	        # The time embeddings are concatenated with the class embeddings. The dimension of the
	        # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
	        # regular time embeddings
	        blocks_time_embed_dim = time_embed_dim * 2
	    else:
	        blocks_time_embed_dim = time_embed_dim

	    # down
	    output_channel = block_out_channels[0]
	    for i, down_block_type in enumerate(down_block_types):
	        input_channel = output_channel
	        output_channel = block_out_channels[i]
	        is_final_block = i == len(block_out_channels) - 1

	        down_block = get_down_block(
	            down_block_type,
	            num_layers=layers_per_block[i],
	            transformer_layers_per_block=transformer_layers_per_block[i],
	            in_channels=input_channel,
	            out_channels=output_channel,
	            temb_channels=blocks_time_embed_dim,
	            add_downsample=not is_final_block,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            resnet_groups=norm_num_groups,
	            cross_attention_dim=cross_attention_dim[i],
	            num_attention_heads=num_attention_heads[i],
	            downsample_padding=downsample_padding,
	            dual_cross_attention=dual_cross_attention,
	            use_linear_projection=use_linear_projection,
	            only_cross_attention=only_cross_attention[i],
	            upcast_attention=upcast_attention,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            attention_type=attention_type,
	            resnet_skip_time_act=resnet_skip_time_act,
	            resnet_out_scale_factor=resnet_out_scale_factor,
	            cross_attention_norm=cross_attention_norm,
	            attention_head_dim=attention_head_dim[i]
	            if attention_head_dim[i] is not None
	            else output_channel,
	            dropout=dropout,
	        )
	        self.down_blocks.append(down_block)

	    # mid
	    if mid_block_type == "UNetMidBlock2DCrossAttn":
	        self.mid_block = UNetMidBlock2DCrossAttn(
	            transformer_layers_per_block=transformer_layers_per_block[-1],
	            in_channels=block_out_channels[-1],
	            temb_channels=blocks_time_embed_dim,
	            dropout=dropout,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            output_scale_factor=mid_block_scale_factor,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            cross_attention_dim=cross_attention_dim[-1],
	            num_attention_heads=num_attention_heads[-1],
	            resnet_groups=norm_num_groups,
	            dual_cross_attention=dual_cross_attention,
	            use_linear_projection=use_linear_projection,
	            upcast_attention=upcast_attention,
	            attention_type=attention_type,
	        )
	    elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
	        self.mid_block = UNetMidBlock2DSimpleCrossAttn(
	            in_channels=block_out_channels[-1],
	            temb_channels=blocks_time_embed_dim,
	            dropout=dropout,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            output_scale_factor=mid_block_scale_factor,
	            cross_attention_dim=cross_attention_dim[-1],
	            attention_head_dim=attention_head_dim[-1],
	            resnet_groups=norm_num_groups,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            skip_time_act=resnet_skip_time_act,
	            only_cross_attention=mid_block_only_cross_attention,
	            cross_attention_norm=cross_attention_norm,
	        )
	    elif mid_block_type == "UNetMidBlock2D":
	        self.mid_block = UNetMidBlock2D(
	            in_channels=block_out_channels[-1],
	            temb_channels=blocks_time_embed_dim,
	            dropout=dropout,
	            num_layers=0,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            output_scale_factor=mid_block_scale_factor,
	            resnet_groups=norm_num_groups,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            add_attention=False,
	        )
	    elif mid_block_type is None:
	        self.mid_block = None
	    else:
	        raise ValueError(f"unknown mid_block_type : {mid_block_type}")

	    # count how many layers upsample the images
	    self.num_upsamplers = 0

	    # up
	    reversed_block_out_channels = list(reversed(block_out_channels))
	    reversed_num_attention_heads = list(reversed(num_attention_heads))
	    reversed_layers_per_block = list(reversed(layers_per_block))
	    reversed_cross_attention_dim = list(reversed(cross_attention_dim))
	    reversed_transformer_layers_per_block = (
	        list(reversed(transformer_layers_per_block))
	        if reverse_transformer_layers_per_block is None
	        else reverse_transformer_layers_per_block
	    )
	    only_cross_attention = list(reversed(only_cross_attention))

	    output_channel = reversed_block_out_channels[0]
	    for i, up_block_type in enumerate(up_block_types):
	        is_final_block = i == len(block_out_channels) - 1

	        prev_output_channel = output_channel
	        output_channel = reversed_block_out_channels[i]
	        input_channel = reversed_block_out_channels[
	            min(i + 1, len(block_out_channels) - 1)
	        ]

	        # add upsample block for all BUT final layer
	        if not is_final_block:
	            add_upsample = True
	            self.num_upsamplers += 1
	        else:
	            add_upsample = False

	        # NOTE(review): `attention_head_dim[i]` is indexed in down-path order
	        # here while every other per-stage value is reversed — confirm this
	        # is intended before changing it.
	        up_block = get_up_block(
	            up_block_type,
	            num_layers=reversed_layers_per_block[i] + 1,
	            transformer_layers_per_block=reversed_transformer_layers_per_block[i],
	            in_channels=input_channel,
	            out_channels=output_channel,
	            prev_output_channel=prev_output_channel,
	            temb_channels=blocks_time_embed_dim,
	            add_upsample=add_upsample,
	            resnet_eps=norm_eps,
	            resnet_act_fn=act_fn,
	            resolution_idx=i,
	            resnet_groups=norm_num_groups,
	            cross_attention_dim=reversed_cross_attention_dim[i],
	            num_attention_heads=reversed_num_attention_heads[i],
	            dual_cross_attention=dual_cross_attention,
	            use_linear_projection=use_linear_projection,
	            only_cross_attention=only_cross_attention[i],
	            upcast_attention=upcast_attention,
	            resnet_time_scale_shift=resnet_time_scale_shift,
	            attention_type=attention_type,
	            resnet_skip_time_act=resnet_skip_time_act,
	            resnet_out_scale_factor=resnet_out_scale_factor,
	            cross_attention_norm=cross_attention_norm,
	            attention_head_dim=attention_head_dim[i]
	            if attention_head_dim[i] is not None
	            else output_channel,
	            dropout=dropout,
	        )
	        self.up_blocks.append(up_block)

	    # out
	    # The output head is created here and set to None further down; the
	    # construction is kept so module creation order stays as it was.
	    if norm_num_groups is not None:
	        self.conv_norm_out = nn.GroupNorm(
	            num_channels=block_out_channels[0],
	            num_groups=norm_num_groups,
	            eps=norm_eps,
	        )

	        self.conv_act = get_activation(act_fn)

	    else:
	        self.conv_norm_out = None
	        self.conv_act = None

	    conv_out_padding = (conv_out_kernel - 1) // 2
	    self.conv_out = nn.Conv2d(
	        block_out_channels[0],
	        out_channels,
	        kernel_size=conv_out_kernel,
	        padding=conv_out_padding,
	    )

	    if attention_type in ["gated", "gated-text-image"]:
	        positive_len = 768
	        if isinstance(cross_attention_dim, int):
	            positive_len = cross_attention_dim
	        elif isinstance(cross_attention_dim, tuple) or isinstance(
	            cross_attention_dim, list
	        ):
	            positive_len = cross_attention_dim[0]

	        feature_type = "text-only" if attention_type == "gated" else "text-image"
	        self.position_net = PositionNet(
	            positive_len=positive_len,
	            out_dim=cross_attention_dim,
	            feature_type=feature_type,
	        )
	    self.need_block_embs = need_block_embs
	    self.need_self_attn_block_embs = need_self_attn_block_embs

	    # only use some referencenet layers, other layers are set to None
	    self.conv_norm_out = None
	    self.conv_act = None
	    self.conv_out = None

	    if not self.need_self_attn_block_embs:
	        # The up path is discarded entirely, so there is nothing to prune.
	        self.up_blocks = None
	    else:
	        # Strip the tail of the last up block. The lookups are guarded
	        # because a final block without attention layers (e.g. "UpBlock2D")
	        # has no `attentions` attribute and used to raise AttributeError.
	        final_attentions = getattr(self.up_blocks[-1], "attentions", None)
	        if final_attentions is not None and len(final_attentions) > 0:
	            final_transformer = final_attentions[-1]
	            final_transformer.proj_out = None
	            transformer_blocks = getattr(
	                final_transformer, "transformer_blocks", None
	            )
	            if transformer_blocks is not None and len(transformer_blocks) > 0:
	                final_block = transformer_blocks[-1]
	                for unused_name in ("attn1", "attn2", "norm2", "ff", "norm3"):
	                    setattr(final_block, unused_name, None)

	    self.insert_spatial_self_attn_idx()

	def forward(
	self,
	sample: torch.FloatTensor,
	timestep: Union[torch.Tensor, float, int],
	encoder_hidden_states: torch.Tensor,
	class_labels: Optional[torch.Tensor] = None,
	timestep_cond: Optional[torch.Tensor] = None,
	attention_mask: Optional[torch.Tensor] = None,
	cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
	down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
	mid_block_additional_residual: Optional[torch.Tensor] = None,
	down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
	encoder_attention_mask: Optional[torch.Tensor] = None,
	return_dict: bool = True,
	# update new paramestes start
	num_frames: int = None,
	return_ndim: int = 5,
	# update new paramestes end
	) -> Union[UNet2DConditionOutput, Tuple]:
	r"""
	The [`UNet2DConditionModel`] forward method.

	Args:
	sample (`torch.FloatTensor`):
	The noisy input tensor with the following shape `(batch, channel, height, width)`.
	timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
	encoder_hidden_states (`torch.FloatTensor`):
	The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
	class_labels (`torch.Tensor`, optional, defaults to `None`):
	Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
	timestep_cond: (`torch.Tensor`, optional, defaults to `None`):
	Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
	through the `self.time_embedding` layer to obtain the timestep embeddings.
	attention_mask (`torch.Tensor`, optional, defaults to `None`):
	An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
	is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
	negative values to the attention scores corresponding to "discard" tokens.
	cross_attention_kwargs (`dict`, optional):
	A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
	`self.processor` in
	[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
	added_cond_kwargs: (`dict`, optional):
	A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
	are passed along to the UNet blocks.
	down_block_additional_residuals: (`tuple` of `torch.Tensor`, optional):
	A tuple of tensors that if specified are added to the residuals of down unet blocks.
	mid_block_additional_residual: (`torch.Tensor`, optional):
	A tensor that if specified is added to the residual of the middle unet block.
	encoder_attention_mask (`torch.Tensor`):
	A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
	`True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
	which adds large negative values to the attention scores corresponding to "discard" tokens.
	return_dict (`bool`, optional, defaults to `True`):
	Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
	tuple.
	cross_attention_kwargs (`dict`, optional):
	A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
	added_cond_kwargs: (`dict`, optional):
	A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
	are passed along to the UNet blocks.
	down_block_additional_residuals (`tuple` of `torch.Tensor`, optional):
	additional residuals to be added to UNet long skip connections from down blocks to up blocks for
	example from ControlNet side model(s)
	mid_block_additional_residual (`torch.Tensor`, optional):
	additional residual to be added to UNet mid block output, for example from ControlNet side model
	down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, optional):
	additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)

	Returns:
	[`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
	If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
	a `tuple` is returned where the first element is the sample tensor.
	"""

	# By default samples have to be AT least a multiple of the overall upsampling factor.
	# The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
	# However, the upsampling interpolation output size can be forced to fit any upsampling size
	# on the fly if necessary.
	default_overall_up_factor = 2**self.num_upsamplers

	# upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
	forward_upsample_size = False
	upsample_size = None

	for dim in sample.shape[-2:]:
	if dim % default_overall_up_factor != 0:
	# Forward upsample size to force interpolation output size.
	forward_upsample_size = True
	break

	# ensure attention_mask is a bias, and give it a singleton query_tokens dimension
	# expects mask of shape:
	# [batch, key_tokens]
	# adds singleton query_tokens dimension:
	# [batch, 1, key_tokens]
	# this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
	# [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
	# [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
	if attention_mask is not None:
	# assume that mask is expressed as:
	# (1 = keep, 0 = discard)
	# convert mask into a bias that can be added to attention scores:
	# (keep = +0, discard = -10000.0)
	attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
	attention_mask = attention_mask.unsqueeze(1)

	# convert encoder_attention_mask to a bias the same way we do for attention_mask
	if encoder_attention_mask is not None:
	encoder_attention_mask = (
	1 - encoder_attention_mask.to(sample.dtype)
	) * -10000.0
	encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

	# 0. center input if necessary
	if self.config.center_input_sample:
	sample = 2 * sample - 1.0

	# 1. time
	timesteps = timestep
	if not torch.is_tensor(timesteps):
	# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
	# This would be a good case for the `match` statement (Python 3.10+)
	is_mps = sample.device.type == "mps"
	if isinstance(timestep, float):
	dtype = torch.float32 if is_mps else torch.float64
	else:
	dtype = torch.int32 if is_mps else torch.int64
	timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
	elif len(timesteps.shape) == 0:
	timesteps = timesteps[None].to(sample.device)

	# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	timesteps = timesteps.expand(sample.shape[0])

	t_emb = self.time_proj(timesteps)

	# `Timesteps` does not contain any weights and will always return f32 tensors
	# but time_embedding might actually be running in fp16. so we need to cast here.
	# there might be better ways to encapsulate this.
	t_emb = t_emb.to(dtype=sample.dtype)

	emb = self.time_embedding(t_emb, timestep_cond)
	aug_emb = None

	if self.class_embedding is not None:
	if class_labels is None:
	raise ValueError(
	"class_labels should be provided when num_class_embeds > 0"
	)

	if self.config.class_embed_type == "timestep":
	class_labels = self.time_proj(class_labels)

	# `Timesteps` does not contain any weights and will always return f32 tensors
	# there might be better ways to encapsulate this.
	class_labels = class_labels.to(dtype=sample.dtype)

	class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)

	if self.config.class_embeddings_concat:
	emb = torch.cat([emb, class_emb], dim=-1)
	else:
	emb = emb + class_emb

	if self.config.addition_embed_type == "text":
	aug_emb = self.add_embedding(encoder_hidden_states)
	elif self.config.addition_embed_type == "text_image":
	# Kandinsky 2.1 - style
	if "image_embeds" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
	)

	image_embs = added_cond_kwargs.get("image_embeds")
	text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
	aug_emb = self.add_embedding(text_embs, image_embs)
	elif self.config.addition_embed_type == "text_time":
	# SDXL - style
	if "text_embeds" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
	)
	text_embeds = added_cond_kwargs.get("text_embeds")
	if "time_ids" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
	)
	time_ids = added_cond_kwargs.get("time_ids")
	time_embeds = self.add_time_proj(time_ids.flatten())
	time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
	add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
	add_embeds = add_embeds.to(emb.dtype)
	aug_emb = self.add_embedding(add_embeds)
	elif self.config.addition_embed_type == "image":
	# Kandinsky 2.2 - style
	if "image_embeds" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
	)
	image_embs = added_cond_kwargs.get("image_embeds")
	aug_emb = self.add_embedding(image_embs)
	elif self.config.addition_embed_type == "image_hint":
	# Kandinsky 2.2 - style
	if (
	"image_embeds" not in added_cond_kwargs
	or "hint" not in added_cond_kwargs
	):
	raise ValueError(
	f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
	)
	image_embs = added_cond_kwargs.get("image_embeds")
	hint = added_cond_kwargs.get("hint")
	aug_emb, hint = self.add_embedding(image_embs, hint)
	sample = torch.cat([sample, hint], dim=1)

	emb = emb + aug_emb if aug_emb is not None else emb

	if self.time_embed_act is not None:
	emb = self.time_embed_act(emb)

	if (
	self.encoder_hid_proj is not None
	and self.config.encoder_hid_dim_type == "text_proj"
	):
	encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
	elif (
	self.encoder_hid_proj is not None
	and self.config.encoder_hid_dim_type == "text_image_proj"
	):
	# Kadinsky 2.1 - style
	if "image_embeds" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
	)

	image_embeds = added_cond_kwargs.get("image_embeds")
	encoder_hidden_states = self.encoder_hid_proj(
	encoder_hidden_states, image_embeds
	)
	elif (
	self.encoder_hid_proj is not None
	and self.config.encoder_hid_dim_type == "image_proj"
	):
	# Kandinsky 2.2 - style
	if "image_embeds" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
	)
	image_embeds = added_cond_kwargs.get("image_embeds")
	encoder_hidden_states = self.encoder_hid_proj(image_embeds)
	elif (
	self.encoder_hid_proj is not None
	and self.config.encoder_hid_dim_type == "ip_image_proj"
	):
	if "image_embeds" not in added_cond_kwargs:
	raise ValueError(
	f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
	)
	image_embeds = added_cond_kwargs.get("image_embeds")
	image_embeds = self.encoder_hid_proj(image_embeds).to(
	encoder_hidden_states.dtype
	)
	encoder_hidden_states = torch.cat(
	[encoder_hidden_states, image_embeds], dim=1
	)

	# need_self_attn_block_embs
	# 初始化
	# 或在unet中运算中会不断 append self_attn_blocks_embs，用完需要清理，
	if self.need_self_attn_block_embs:
	self_attn_block_embs = [None] * self.self_attn_num
	else:
	self_attn_block_embs = None
	# 2. pre-process
	sample = self.conv_in(sample)
	if self.print_idx == 0:
	logger.debug(f"after conv in sample={sample.mean()}")
	# 2.5 GLIGEN position net
	if (
	cross_attention_kwargs is not None
	and cross_attention_kwargs.get("gligen", None) is not None
	):
	cross_attention_kwargs = cross_attention_kwargs.copy()
	gligen_args = cross_attention_kwargs.pop("gligen")
	cross_attention_kwargs["gligen"] = {
	"objs": self.position_net(**gligen_args)
	}

	# 3. down
	lora_scale = (
	cross_attention_kwargs.get("scale", 1.0)
	if cross_attention_kwargs is not None
	else 1.0
	)
	if USE_PEFT_BACKEND:
	# weight the lora layers by setting `lora_scale` for each PEFT layer
	scale_lora_layers(self, lora_scale)

	is_controlnet = (
	mid_block_additional_residual is not None
	and down_block_additional_residuals is not None
	)
	# using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
	is_adapter = down_intrablock_additional_residuals is not None
	# maintain backward compatibility for legacy usage, where
	# T2I-Adapter and ControlNet both use down_block_additional_residuals arg
	# but can only use one or the other
	if (
	not is_adapter
	and mid_block_additional_residual is None
	and down_block_additional_residuals is not None
	):
	deprecate(
	"T2I should not use down_block_additional_residuals",
	"1.3.0",
	"Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
	and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
	for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
	standard_warn=False,
	)
	down_intrablock_additional_residuals = down_block_additional_residuals
	is_adapter = True

	down_block_res_samples = (sample,)
	for i_downsample_block, downsample_block in enumerate(self.down_blocks):
	if (
	hasattr(downsample_block, "has_cross_attention")
	and downsample_block.has_cross_attention
	):
	# For t2i-adapter CrossAttnDownBlock2D
	additional_residuals = {}
	if is_adapter and len(down_intrablock_additional_residuals) > 0:
	additional_residuals[
	"additional_residuals"
	] = down_intrablock_additional_residuals.pop(0)
	if self.print_idx == 0:
	logger.debug(
	f"downsample_block {i_downsample_block} sample={sample.mean()}"
	)
	sample, res_samples = downsample_block(
	hidden_states=sample,
	temb=emb,
	encoder_hidden_states=encoder_hidden_states,
	attention_mask=attention_mask,
	cross_attention_kwargs=cross_attention_kwargs,
	encoder_attention_mask=encoder_attention_mask,
	**additional_residuals,
	self_attn_block_embs=self_attn_block_embs,
	)
	else:
	sample, res_samples = downsample_block(
	hidden_states=sample,
	temb=emb,
	scale=lora_scale,
	self_attn_block_embs=self_attn_block_embs,
	)
	if is_adapter and len(down_intrablock_additional_residuals) > 0:
	sample += down_intrablock_additional_residuals.pop(0)

	down_block_res_samples += res_samples

	if is_controlnet:
	new_down_block_res_samples = ()

	for down_block_res_sample, down_block_additional_residual in zip(
	down_block_res_samples, down_block_additional_residuals
	):
	down_block_res_sample = (
	down_block_res_sample + down_block_additional_residual
	)
	new_down_block_res_samples = new_down_block_res_samples + (
	down_block_res_sample,
	)

	down_block_res_samples = new_down_block_res_samples

	# update code start
	def reshape_return_emb(tmp_emb):
	if return_ndim == 4:
	return tmp_emb
	elif return_ndim == 5:
	return rearrange(tmp_emb, "(b t) c h w-> b c t h w", t=num_frames)
	else:
	raise ValueError(
	f"reshape_emb only support 4, 5 but given {return_ndim}"
	)

	if self.need_block_embs:
	return_down_block_res_samples = [
	reshape_return_emb(tmp_emb) for tmp_emb in down_block_res_samples
	]
	else:
	return_down_block_res_samples = None
	# update code end

	# 4. mid
	if self.mid_block is not None:
	if (
	hasattr(self.mid_block, "has_cross_attention")
	and self.mid_block.has_cross_attention
	):
	sample = self.mid_block(
	sample,
	emb,
	encoder_hidden_states=encoder_hidden_states,
	attention_mask=attention_mask,
	cross_attention_kwargs=cross_attention_kwargs,
	encoder_attention_mask=encoder_attention_mask,
	self_attn_block_embs=self_attn_block_embs,
	)
	else:
	sample = self.mid_block(sample, emb)

	# To support T2I-Adapter-XL
	if (
	is_adapter
	and len(down_intrablock_additional_residuals) > 0
	and sample.shape == down_intrablock_additional_residuals[0].shape
	):
	sample += down_intrablock_additional_residuals.pop(0)

	if is_controlnet:
	sample = sample + mid_block_additional_residual

	if self.need_block_embs:
	return_mid_block_res_samples = reshape_return_emb(sample)
	logger.debug(
	f"return_mid_block_res_samples, is_leaf={return_mid_block_res_samples.is_leaf}, requires_grad={return_mid_block_res_samples.requires_grad}"
	)
	else:
	return_mid_block_res_samples = None

	if self.up_blocks is not None:
	# update code end

	# 5. up
	for i, upsample_block in enumerate(self.up_blocks):
	is_final_block = i == len(self.up_blocks) - 1

	res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
	down_block_res_samples = down_block_res_samples[
	: -len(upsample_block.resnets)
	]

	# if we have not reached the final block and need to forward the
	# upsample size, we do it here
	if not is_final_block and forward_upsample_size:
	upsample_size = down_block_res_samples[-1].shape[2:]

	if (
	hasattr(upsample_block, "has_cross_attention")
	and upsample_block.has_cross_attention
	):
	sample = upsample_block(
	hidden_states=sample,
	temb=emb,
	res_hidden_states_tuple=res_samples,
	encoder_hidden_states=encoder_hidden_states,
	cross_attention_kwargs=cross_attention_kwargs,
	upsample_size=upsample_size,
	attention_mask=attention_mask,
	encoder_attention_mask=encoder_attention_mask,
	self_attn_block_embs=self_attn_block_embs,
	)
	else:
	sample = upsample_block(
	hidden_states=sample,
	temb=emb,
	res_hidden_states_tuple=res_samples,
	upsample_size=upsample_size,
	scale=lora_scale,
	self_attn_block_embs=self_attn_block_embs,
	)

	# update code start
	if self.need_block_embs or self.need_self_attn_block_embs:
	if self_attn_block_embs is not None:
	self_attn_block_embs = [
	reshape_return_emb(tmp_emb=tmp_emb)
	for tmp_emb in self_attn_block_embs
	]
	self.print_idx += 1
	return (
	return_down_block_res_samples,
	return_mid_block_res_samples,
	self_attn_block_embs,
	)

	if not self.need_block_embs and not self.need_self_attn_block_embs:
	# 6. post-process
	if self.conv_norm_out:
	sample = self.conv_norm_out(sample)
	sample = self.conv_act(sample)
	sample = self.conv_out(sample)

	if USE_PEFT_BACKEND:
	# remove `lora_scale` from each PEFT layer
	unscale_lora_layers(self, lora_scale)
	self.print_idx += 1
	if not return_dict:
	return (sample,)

	return UNet2DConditionOutput(sample=sample)

	def insert_spatial_self_attn_idx(self):
	    """Stamp each spatial self-attention layer and transformer block with its index.

	    Reads the ``(attns, basic_transformers)`` pair from
	    ``self.spatial_self_attns``, records the number of attention entries in
	    ``self.self_attn_num``, and writes every non-``None`` layer's position
	    within its own list to ``layer.spatial_self_attn_idx``.
	    """
	    attn_entries, block_entries = self.spatial_self_attns
	    self.self_attn_num = len(attn_entries)

	    # Both lists get the same treatment; attention layers are handled first,
	    # then the transformer blocks, each list indexed from zero.
	    for entries in (attn_entries, block_entries):
	        for i, (name, layer) in enumerate(entries):
	            logger.debug(f"{self.__class__.__name__}, {i}, {name}, {type(layer)}")
	            if layer is None:
	                continue
	            layer.spatial_self_attn_idx = i

	@property
	def spatial_self_attns(
	    self,
	) -> Tuple[List[Tuple[str, Attention]], List[Tuple[str, torch.nn.Module]]]:
	    """Spatial self-attention layers and their transformer blocks, ordered by name.

	    Gathers the entries from
	    ``get_self_attns(include="attentions", exclude="temp_attentions")`` and
	    sorts both lists by module name.

	    Returns:
	        A pair ``(attns, spatial_transformers)``. Each element is a new list
	        holding the ``[name, module]`` entries produced by
	        ``get_self_attns``, sorted by ``name``.
	    """
	    attns, spatial_transformers = self.get_self_attns(
	        include="attentions", exclude="temp_attentions"
	    )

	    # Sort on the name alone. Comparing whole ``[name, module]`` entries would
	    # fall through to comparing the modules themselves if two names ever
	    # tied, and that comparison raises ``TypeError``.
	    def entry_name(entry):
	        return entry[0]

	    return (
	        sorted(attns, key=entry_name),
	        sorted(spatial_transformers, key=entry_name),
	    )

	def get_self_attns(
	    self, include: Optional[str] = None, exclude: Optional[str] = None
	) -> Tuple[List[Tuple[str, Attention]], List[Tuple[str, torch.nn.Module]]]:
	    r"""Collect the ``attn1`` layers of the model's ``BasicTransformerBlock`` modules.

	    Walks the module tree below ``self`` in pre-order. Every
	    ``BasicTransformerBlock`` that has an ``attn1`` attribute and whose
	    dotted name passes the filters is recorded.

	    Args:
	        include: If given, only blocks whose dotted name contains this
	            substring are kept.
	        exclude: If given, blocks whose dotted name contains this substring
	            are dropped.

	    Returns:
	        A pair ``(attns, spatial_transformers)`` of lists in traversal order.
	        ``attns`` holds ``[f"{name}.attn1", block.attn1]`` entries and
	        ``spatial_transformers`` holds the matching ``[name, block]`` entries.
	    """
	    attns = []
	    spatial_transformers = []

	    def is_selected(name: str) -> bool:
	        # Both filters must hold. Previously the ``exclude`` test overwrote
	        # the ``include`` result, so ``include`` was ignored whenever
	        # ``exclude`` was also supplied.
	        if include is not None and include not in name:
	            return False
	        return exclude is None or exclude not in name

	    def collect(name: str, module: torch.nn.Module) -> None:
	        if (
	            isinstance(module, BasicTransformerBlock)
	            and hasattr(module, "attn1")
	            and is_selected(name)
	        ):
	            attns.append([f"{name}.attn1", module.attn1])
	            spatial_transformers.append([f"{name}", module])
	        # Keep descending even below a match so nested blocks are found too.
	        for sub_name, child in module.named_children():
	            collect(f"{name}.{sub_name}", child)

	    for name, module in self.named_children():
	        collect(name, module)

	    return attns, spatial_transformers


	class ReferenceNet3D(UNet3DConditionModel):
	    """Reference network derived from ``UNet3DConditionModel``.

	    Adds nothing to the base class. Per the original author's note, it is
	    used to extract intermediate embeddings for later use.
	    """