# FlowMDM/model/FlowMDM.py
import numpy as np
import torch
import torch.nn as nn
from model.rotation2xyz import Rotation2xyz
from model.MDM import InputProcess, OutputProcess
from model.base_models import TextConditionalModel
from model.x_transformers.x_transformers import ContinuousTransformerWrapper, Encoder


class BPE_Schedule:
    """Blended Positional Encodings (BPE) schedule.

    Decides, for each diffusion timestep, whether the transformer uses absolute (APE)
    or relative (RPE) positional encodings:
      - training: random dropout, RPE with probability `training_rate`.
      - inference: binary step function, switching from APE to RPE at denoising step
        `inference_step` (-1 keeps APE for the whole chain, 0 keeps RPE).
    """

    def __init__(self, training_rate: float, inference_step: int, max_steps: int) -> None:
        assert 0 <= training_rate <= 1, "training_rate must be between 0 and 1"
        assert inference_step == -1 or 0 <= inference_step <= max_steps, "inference_step must be between 0 and max_steps"
        self.training_rate = training_rate
        self.inference_step = inference_step
        self.max_steps = max_steps
        self.last_random = None  # cached per-batch random draw, refreshed by step()

    def step(self, t: torch.Tensor, training: bool):
        # draw one random number per batch element; get_schedule_fn() reuses it for the
        # APE/RPE dropout at training time
        self.last_random = torch.rand(t.shape[0], device=t.device)

    def get_schedule_fn(self, t: torch.Tensor, training: bool) -> torch.Tensor:
        # False --> absolute (APE) / True --> relative (RPE)
        if training:  # at TRAINING: random dropout
            return self.last_random < self.training_rate
        # at INFERENCE: binary step function as BPE schedule
        elif self.inference_step == -1:  # --> whole denoising chain with APE (absolute)
            return torch.zeros_like(t, dtype=torch.bool)
        elif self.inference_step == 0:  # --> whole denoising chain with RPE (relative)
            return torch.ones_like(t, dtype=torch.bool)
        else:  # --> BPE: switch from APE to RPE at denoising step `self.inference_step`
            return t <= self.max_steps - self.inference_step  # equivalent to ~(t > max_steps - inference_step)

    def use_bias(self, t: torch.Tensor, training: bool) -> torch.Tensor:
        # returns True if the absolute bias should be used (only relevant for multi-segment **inference**)
        assert (t[0] == t).all(), "Bias from mixed schedule only supported when using same timestep for all batch elements: " + str(t)
        return ~self.get_schedule_fn(t[0], training)  # if APE --> use the bias to limit attention to each subsequence

    def get_time_weights(self, t: torch.Tensor, training: bool) -> torch.Tensor:
        # 0 --> absolute (APE) / 1 --> relative (RPE)
        return self.get_schedule_fn(t, training).to(torch.int32)
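

# Illustrative worked example (added sketch, not part of the original file), assuming the
# hypothetical values max_steps=1000 and inference_step=125, i.e. BPE_Schedule(0.5, 125, 1000):
#   * t > 875  (early denoising steps) --> get_schedule_fn(...) is False --> APE (absolute)
#   * t <= 875 (later denoising steps) --> get_schedule_fn(...) is True  --> RPE (relative)
# With inference_step=-1 the whole chain uses APE; with inference_step=0 it uses RPE.
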
class FlowMDM(TextConditionalModel):
    """Text-conditional motion diffusion model with Blended Positional Encodings.

    The transformer encoder blends absolute (APE) and relative (RPE) positional encodings
    according to `self.bpe_schedule`, and merges the condition into the attention queries
    through the per-layer PCCAT projections passed as `custom_query_fn`.
    """
def __init__(self, njoints, nfeats, translation, pose_rep, glob, glob_rot,
latent_dim=256, ff_size=1024, num_layers=8, num_heads=4, dropout=0.1,
data_rep='rot6d', dataset='babel',
clip_dim=512, clip_version=None, cond_mode="no_cond", cond_mask_prob=0.,
**kargs):
super().__init__(latent_dim=latent_dim, cond_mode=cond_mode, cond_mask_prob=cond_mask_prob, dropout=dropout, clip_dim=clip_dim, clip_version=clip_version)
self.njoints = njoints
self.nfeats = nfeats
self.data_rep = data_rep
self.dataset = dataset
self.pose_rep = pose_rep
self.glob = glob
self.glob_rot = glob_rot
self.translation = translation
self.latent_dim = latent_dim
self.ff_size = ff_size
self.num_layers = num_layers
self.num_heads = num_heads
self.dropout = dropout
self.input_feats = self.njoints * self.nfeats
self.max_seq_att = kargs.get('max_seq_att', 1024)
self.input_process = InputProcess(self.data_rep, self.input_feats, self.latent_dim)
        # one dense layer per transformer layer, used as `custom_query_fn` below to merge the
        # condition into the attention queries (PCCAT)
        self.process_cond_input = [nn.Linear(2*self.latent_dim, self.latent_dim) for _ in range(self.num_layers)]
        print("FlowMDM init")
self.use_chunked_att = kargs.get('use_chunked_att', False)
        bpe_training_rate = kargs.get('bpe_training_ratio', 0.5)  # probability of using RPE (vs. APE) at training time
bpe_inference_step = kargs.get('bpe_denoising_step', None)
diffusion_steps = kargs.get('diffusion_steps', None)
self.bpe_schedule = BPE_Schedule(bpe_training_rate, bpe_inference_step, diffusion_steps)
ws = kargs.get('rpe_horizon', -1) # Max attention horizon
self.local_attn_window_size = 200 if ws == -1 else ws
print("[Training] RPE/APE rate:", bpe_training_rate)
print(f"[Inference] BPE switch from APE to RPE at denoising step {bpe_inference_step}/{diffusion_steps}.")
print("Local attention window size:", self.local_attn_window_size)
self.seqTransEncoder = ContinuousTransformerWrapper(
dim_in = self.latent_dim, dim_out = self.latent_dim,
emb_dropout = self.dropout,
max_seq_len = self.max_seq_att,
use_abs_pos_emb = True,
absolute_bpe_schedule = self.bpe_schedule, # bpe schedule for absolute embeddings (APE)
attn_layers = Encoder(
dim = self.latent_dim,
depth = self.num_layers,
heads = self.num_heads,
                ff_mult = int(np.round(self.ff_size / self.latent_dim)),  # e.g., 2 with the original MDM hyperparameters (ff_size=1024, latent_dim=512)
layer_dropout = self.dropout, cross_attn_tokens_dropout = 0,
# ======== FLOWMDM ========
custom_layers=('A', 'f'), # A --> PCCAT
custom_query_fn = self.process_cond_input, # function that merges the condition into the query --> PCCAT dense layer (see Fig. 3)
attn_max_attend_past = self.local_attn_window_size,
attn_max_attend_future = self.local_attn_window_size,
# ======== RELATIVE POSITIONAL EMBEDDINGS ========
rotary_pos_emb = True, # rotary embeddings
rotary_bpe_schedule = self.bpe_schedule, # bpe schedule for rotary embeddings (RPE)
)
)
self.output_process = OutputProcess(self.data_rep, self.input_feats, self.latent_dim, self.njoints,
self.nfeats)
self.rot2xyz = Rotation2xyz(device='cpu', dataset=self.dataset)
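
    # Illustrative construction sketch (added, not part of the original file). The values below
    # are hypothetical placeholders; only the keyword names mirror the constructor arguments and
    # the `kargs` entries read above. Note that `bpe_denoising_step` and `diffusion_steps` have
    # no usable defaults here (BPE_Schedule expects integers):
    #   model = FlowMDM(njoints=135, nfeats=1, translation=True, pose_rep='rot6d',
    #                   glob=True, glob_rot=True, latent_dim=256, ff_size=1024,
    #                   num_layers=8, num_heads=4, dataset='babel', cond_mode='text',
    #                   bpe_denoising_step=125, diffusion_steps=1000, rpe_horizon=100)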

    def forward(self, x, timesteps, y):
        """
        x: [batch_size, njoints, nfeats, max_frames], denoted x_t in the paper
        timesteps: [batch_size] (int)
        y: model_kwargs dict with mask, pe_bias, pos_pe_abs, conditions_mask. See DiffusionWrapper_FlowMDM.
        Returns a tensor of shape [batch_size, njoints, nfeats, max_frames].
        """
bs, njoints, nfeats, nframes = x.shape
        mask = y['mask'].reshape(bs, nframes).to(x.device).bool()  # [bs, max_frames]
self.bpe_schedule.step(timesteps, self.training) # update the BPE scheduler (decides either APE or RPE for each timestep)
        if self.training or self.bpe_schedule.use_bias(timesteps, self.training):
            # APE mode: the bias limits attention to inside each conditioned subsequence.
            # At training time, the BPE dropout decides whether it is actually applied.
            pe_bias = y.get("pe_bias", None)
            chunked_attn = False
        else:
            # RPE mode at inference: no subsequence bias, so attention is not restricted to each subsequence
            pe_bias = None
            chunked_attn = self.use_chunked_att  # faster chunked attention for very long sequences (see the Longformer paper)
# store info needed for the relative PE --> rotary embedding
rotary_kwargs = {'timesteps': timesteps, 'pos_pe_abs': y.get("pos_pe_abs", None), 'training': self.training, 'pe_bias': pe_bias }
# ============== INPUT PROCESSING ==============
emb = self.compute_embedding(x, timesteps, y)
x = self.input_process(x) # [seqlen, bs, d]
# ============== MAIN ARCHITECTURE ==============
# APE or RPE is injected inside seqTransEncoder forward function
x, emb = x.permute(1, 0, 2), emb.permute(1, 0, 2)
output = self.seqTransEncoder(x, mask=mask, cond_tokens=emb, attn_bias=pe_bias, rotary_kwargs=rotary_kwargs, chunked_attn=chunked_attn) # [bs, seqlen, d]
output = output.permute(1, 0, 2) # [seqlen, bs, d]
# ============== OUTPUT PROCESSING ==============
return self.output_process(output) # [bs, njoints, nfeats, nframes]
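
    # Illustrative call sketch (added, not part of the original file; the keys and shapes of `y`
    # are assumptions based on the docstring above and on how `forward` reads them):
    #   x = torch.randn(bs, njoints, nfeats, nframes)                      # noisy motion x_t
    #   t = torch.randint(0, diffusion_steps, (bs,))                       # one timestep per element
    #   y = {'mask': torch.ones(bs, 1, 1, nframes, dtype=torch.bool),      # reshaped to [bs, nframes]
    #        'pe_bias': None, 'pos_pe_abs': None, ...}                     # see DiffusionWrapper_FlowMDM
    #   out = model(x, t, y)                                               # [bs, njoints, nfeats, nframes]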

    def _apply(self, fn):
        super()._apply(fn)
        self.rot2xyz.smpl_model._apply(fn)
        return self  # keep the nn.Module contract so calls like .to(device) can chain

    def train(self, *args, **kwargs):
        super().train(*args, **kwargs)
        self.rot2xyz.smpl_model.train(*args, **kwargs)
        return self
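

# --- Minimal smoke test (added sketch, not part of the original file) ---
# It exercises only BPE_Schedule, so it needs no CLIP/SMPL assets; the step counts below are
# arbitrary example values, not the official configuration.
if __name__ == "__main__":
    max_steps, switch_step = 1000, 125
    schedule = BPE_Schedule(training_rate=0.5, inference_step=switch_step, max_steps=max_steps)

    # Inference: walk the denoising chain from t = max_steps - 1 down to 0 and count the modes.
    modes = []
    for t_val in reversed(range(max_steps)):
        t = torch.full((4,), t_val, dtype=torch.long)
        schedule.step(t, training=False)
        modes.append(bool(schedule.get_schedule_fn(t, training=False)[0]))  # False=APE, True=RPE
    print("APE steps:", modes.count(False), "| RPE steps:", modes.count(True))  # 124 APE / 876 RPE

    # Training: the APE/RPE choice is a per-element Bernoulli(training_rate) dropout.
    t = torch.randint(0, max_steps, (8,))
    schedule.step(t, training=True)
    print("Training RPE mask:", schedule.get_schedule_fn(t, training=True))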