HunyuanDiT-finetune

App Files Files Community

HunyuanDiT-finetune / hydit /modules /embedders.py

multimodalart HF staff

Upload 57 files

8a09a62 verified 5 months ago

raw

history blame

4.09 kB

	import math
	import torch
	import torch.nn as nn
	from einops import repeat

	from timm.models.layers import to_2tuple


	class PatchEmbed(nn.Module):
	    """2D image to patch embedding.

	    Splits an image into non-overlapping patches and projects every patch to
	    ``embed_dim`` channels with a single ``nn.Conv2d`` whose kernel size and
	    stride both equal the patch size.

	    Based on the impl in https://github.com/google-research/vision_transformer

	    Hacked together by / Copyright 2020 Ross Wightman

	    ``forward`` deliberately does not check the input resolution against
	    ``img_size`` so that multi-resolution images can be fed through the same
	    module. ``img_size``, ``grid_size`` and ``num_patches`` are not read by
	    ``forward``; refresh them with ``update_image_size`` when needed.

	    Args:
	        img_size: Nominal input size, either an int (square image) or a
	            tuple/list ``(H, W)``.
	        patch_size: Patch size, normalised with ``to_2tuple``.
	        in_chans: Number of input channels of the projection.
	        embed_dim: Number of output channels of the projection.
	        norm_layer: Optional callable invoked as ``norm_layer(embed_dim)`` to
	            build the normalisation applied after the projection;
	            ``nn.Identity`` is used when it is falsy.
	        flatten: If true, the projected map is flattened BCHW -> BNC.
	        bias: Whether the projection convolution has a bias term.

	    Raises:
	        ValueError: If ``img_size`` is neither an int nor a tuple/list of
	            length 2.
	    """

	    def __init__(
	        self,
	        img_size=224,
	        patch_size=16,
	        in_chans=3,
	        embed_dim=768,
	        norm_layer=None,
	        flatten=True,
	        bias=True,
	    ):
	        super().__init__()
	        size_pair = self._as_size_pair(img_size)
	        if size_pair is None:
	            raise ValueError(f"img_size must be int or tuple/list of length 2. Got {img_size}")
	        patch_size = to_2tuple(patch_size)
	        self.img_size = size_pair
	        self.patch_size = patch_size
	        self.grid_size = (size_pair[0] // patch_size[0], size_pair[1] // patch_size[1])
	        self.num_patches = self.grid_size[0] * self.grid_size[1]
	        self.flatten = flatten

	        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
	        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

	    @staticmethod
	    def _as_size_pair(img_size):
	        """Return ``img_size`` as an ``(H, W)`` tuple, or ``None`` if its form is not recognised."""
	        if isinstance(img_size, int):
	            return (img_size, img_size)
	        if isinstance(img_size, (tuple, list)) and len(img_size) == 2:
	            return tuple(img_size)
	        return None

	    def update_image_size(self, img_size):
	        """Recompute ``img_size``, ``grid_size`` and ``num_patches`` for a new resolution.

	        Accepts the same forms as the constructor (an int or a tuple/list
	        ``(H, W)``) and stores them as a tuple. Any other indexable value is
	        stored as given, as before, and only its first two items are used.
	        """
	        size_pair = self._as_size_pair(img_size)
	        if size_pair is not None:
	            img_size = size_pair
	        self.img_size = img_size
	        self.grid_size = (img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1])
	        self.num_patches = self.grid_size[0] * self.grid_size[1]

	    def forward(self, x):
	        """Project ``x`` (BCHW) to patch embeddings.

	        Returns the BNC token sequence when ``flatten`` is true, otherwise the
	        projected BCHW map; the result is passed through ``self.norm`` either way.
	        """
	        # No H/W check against self.img_size here: the check was removed on
	        # purpose so inputs of different resolutions can be embedded.
	        x = self.proj(x)
	        if self.flatten:
	            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
	        x = self.norm(x)
	        return x


	def timestep_embedding(t, dim, max_period=10000, repeat_only=False):
	    """
	    Create sinusoidal timestep embeddings.

	    :param t: a 1-D Tensor of N indices, one per batch element.
	        These may be fractional.
	    :param dim: the dimension of the output.
	    :param max_period: controls the minimum frequency of the embeddings.
	    :param repeat_only: if true, skip the sinusoidal encoding and return the
	        raw values of ``t`` repeated ``dim`` times along a new last axis.
	    :return: an (N, D) Tensor of positional embeddings.
	    """
	    # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
	    if repeat_only:
	        return repeat(t, "b -> b d", d=dim)

	    half = dim // 2
	    freqs = torch.exp(
	        -math.log(max_period)
	        * torch.arange(start=0, end=half, dtype=torch.float32)
	        / half
	    ).to(device=t.device)  # size: [dim/2], an exponentially decaying curve
	    args = t[:, None].float() * freqs[None]
	    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
	    if dim % 2:
	        # Pad odd dims with an explicit (N, 1) zero column. Slicing the pad
	        # out of `embedding` itself gives zero columns when half == 0
	        # (dim == 1), which would return (N, 0) instead of (N, 1).
	        zero_col = embedding.new_zeros(embedding.shape[0], 1)
	        embedding = torch.cat([embedding, zero_col], dim=-1)
	    return embedding


	class TimestepEmbedder(nn.Module):
	    """
	    Maps scalar timesteps to vectors.

	    The timesteps are first turned into ``frequency_embedding_size``-wide
	    features by ``timestep_embedding`` and then passed through a two-layer
	    MLP (Linear -> SiLU -> Linear). The MLP output width is ``out_size``,
	    falling back to ``hidden_size`` when ``out_size`` is None.
	    """

	    def __init__(self, hidden_size, frequency_embedding_size=256, out_size=None):
	        super().__init__()
	        self.frequency_embedding_size = frequency_embedding_size
	        output_width = hidden_size if out_size is None else out_size
	        # Layers are built in the same order as they sit in the Sequential
	        # (indices 0, 1, 2), so parameter names stay mlp.0.* and mlp.2.*.
	        stages = [
	            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
	            nn.SiLU(),
	            nn.Linear(hidden_size, output_width, bias=True),
	        ]
	        self.mlp = nn.Sequential(*stages)

	    def forward(self, t):
	        # Cast the frequency features to the dtype of the first Linear's
	        # weight before they enter the MLP.
	        weight_dtype = self.mlp[0].weight.dtype
	        freq_features = timestep_embedding(t, self.frequency_embedding_size)
	        return self.mlp(freq_features.type(weight_dtype))