# Provenance: i3d / spar3d/models/diffusion/gaussian_diffusion.py
# Uploaded by John6666 ("Upload 77 files"), commit b572032, 20.5 kB.

	# --------------------------------------------------------
	# Adapted from: https://github.com/openai/point-e
	# Licensed under the MIT License
	# Copyright (c) 2022 OpenAI

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	# --------------------------------------------------------

	import math
	from typing import Any, Dict, Iterable, Optional, Sequence, Union

	import numpy as np
	import torch as th


	def sigmoid_schedule(t, start=-3, end=3, tau=0.6, clip_min=1e-9):
	    """
	    Evaluate a sigmoid-shaped, decreasing schedule at ``t``.

	    The logistic function is evaluated at ``(t * (end - start) + start) / tau``
	    and rescaled with its values at ``start / tau`` and ``end / tau``, so that
	    ``t = 0`` gives 1 and ``t = 1`` gives 0 before clipping.

	    :param t: position(s) at which to evaluate; a scalar or numpy array.
	    :param start: logistic input (before dividing by ``tau``) used at t = 0.
	    :param end: logistic input (before dividing by ``tau``) used at t = 1.
	    :param tau: divisor applied to the logistic input.
	    :param clip_min: lower bound applied to the result.
	    :return: the rescaled value(s), clipped to ``[clip_min, 1.0]``.
	    """

	    def _logistic(z):
	        return 1 / (1 + np.exp(-z))

	    low = _logistic(start / tau)
	    high = _logistic(end / tau)
	    raw = _logistic((t * (end - start) + start) / tau)
	    rescaled = (high - raw) / (high - low)
	    return np.clip(rescaled, clip_min, 1.0)


	def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
	    """
	    Build a beta schedule through the deprecated API.

	    See get_named_beta_schedule() for the new library of schedules.

	    :param beta_schedule: schedule name; only "linear" is implemented.
	    :param beta_start: first beta of the schedule.
	    :param beta_end: last beta of the schedule.
	    :param num_diffusion_timesteps: number of betas to produce.
	    :return: a float64 numpy array of shape (num_diffusion_timesteps,).
	    :raises NotImplementedError: for any schedule name other than "linear".
	    """
	    if beta_schedule != "linear":
	        raise NotImplementedError(beta_schedule)

	    schedule = np.linspace(
	        beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
	    )
	    assert schedule.shape == (num_diffusion_timesteps,)
	    return schedule


	def get_named_beta_schedule(schedule_name, num_diffusion_timesteps, exp_p=12):
	    """
	    Return the pre-defined beta schedule registered under ``schedule_name``.

	    The beta schedule library consists of beta schedules which remain similar
	    in the limit of num_diffusion_timesteps.
	    Beta schedules may be added, but should not be removed or changed once
	    they are committed to maintain backwards compatibility.

	    :param schedule_name: one of "linear", "cosine" or "sigmoid".
	    :param num_diffusion_timesteps: number of betas to produce.
	    :param exp_p: accepted for interface compatibility; not used by any of
	                  the schedules implemented here.
	    :return: a numpy array of betas.
	    :raises NotImplementedError: if the schedule name is not recognised.
	    """
	    if schedule_name == "linear":
	        # Linear schedule from Ho et al, extended to work for any number of
	        # diffusion steps.
	        rescale = 1000 / num_diffusion_timesteps
	        return get_beta_schedule(
	            "linear",
	            beta_start=rescale * 0.0001,
	            beta_end=rescale * 0.02,
	            num_diffusion_timesteps=num_diffusion_timesteps,
	        )

	    if schedule_name == "cosine":

	        def cosine_alpha_bar(t):
	            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

	        return betas_for_alpha_bar(num_diffusion_timesteps, cosine_alpha_bar)

	    if schedule_name == "sigmoid":
	        # Sigmoid schedule passed through betas_for_alpha_bar

	        def sigmoid_alpha_bar(t):
	            return sigmoid_schedule(t)

	        return betas_for_alpha_bar(num_diffusion_timesteps, sigmoid_alpha_bar)

	    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


	def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
	    """
	    Discretize a continuous alpha_bar(t) function into a beta schedule.

	    alpha_bar defines the cumulative product of (1-beta) over time from
	    t = [0,1]. Step i uses ``1 - alpha_bar((i + 1) / N) / alpha_bar(i / N)``,
	    capped at ``max_beta``.

	    :param num_diffusion_timesteps: the number of betas to produce.
	    :param alpha_bar: a callable that takes an argument t from 0 to 1 and
	                      produces the cumulative product of (1-beta) up to
	                      that part of the diffusion process.
	    :param max_beta: the maximum beta to use; use values lower than 1 to
	                     prevent singularities.
	    :return: a numpy array holding num_diffusion_timesteps betas.
	    """
	    total = num_diffusion_timesteps
	    capped = [
	        min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
	        for step in range(total)
	    ]
	    return np.array(capped)


	def space_timesteps(num_timesteps, section_counts):
	    """
	    Create a list of timesteps to use from an original diffusion process,
	    given the number of timesteps we want to take from equally-sized portions
	    of the original process.

	    For example, if there's 300 timesteps and the section counts are [10,15,20]
	    then the first 100 timesteps are strided to be 10 timesteps, the second 100
	    are strided to be 15 timesteps, and the final 100 are strided to be 20.

	    :param num_timesteps: the number of diffusion steps in the original
	                          process to divide up.
	    :param section_counts: either a list of numbers, or a string containing
	                           comma-separated numbers, indicating the step count
	                           per section. Two string forms are special-cased:
	                           "ddimN", where N is the number of steps to take
	                           with a fixed integer stride (the striding from
	                           the DDIM paper), and "exactA,B,...", which selects
	                           exactly the listed timestep indices.
	    :return: a set of diffusion steps from the original process to use.
	    :raises ValueError: if no integer stride yields exactly N steps for
	                        "ddimN", if an "exact" index lies outside
	                        [0, num_timesteps), or if a section has fewer
	                        timesteps than the count requested from it.
	    """
	    if isinstance(section_counts, str):
	        if section_counts.startswith("ddim"):
	            desired_count = int(section_counts[len("ddim") :])
	            for stride in range(1, num_timesteps):
	                candidate = range(0, num_timesteps, stride)
	                if len(candidate) == desired_count:
	                    return set(candidate)
	            # Report the number of steps that was asked for, not the length
	            # of the original process.
	            raise ValueError(
	                f"cannot create exactly {desired_count} steps with an integer stride"
	            )
	        elif section_counts.startswith("exact"):
	            res = {int(x) for x in section_counts[len("exact") :].split(",")}
	            for x in res:
	                if x < 0 or x >= num_timesteps:
	                    raise ValueError(f"timestep out of bounds: {x}")
	            return res
	        section_counts = [int(x) for x in section_counts.split(",")]

	    # Split the process into len(section_counts) contiguous sections; the
	    # first `extra` sections each receive one additional timestep.
	    size_per = num_timesteps // len(section_counts)
	    extra = num_timesteps % len(section_counts)
	    start_idx = 0
	    all_steps = []
	    for i, section_count in enumerate(section_counts):
	        size = size_per + (1 if i < extra else 0)
	        if size < section_count:
	            raise ValueError(
	                f"cannot divide section of {size} steps into {section_count}"
	            )
	        if section_count <= 1:
	            frac_stride = 1
	        else:
	            # Fractional stride so that the first and last timestep of the
	            # section are both selected.
	            frac_stride = (size - 1) / (section_count - 1)
	        cur_idx = 0.0
	        for _ in range(section_count):
	            all_steps.append(start_idx + round(cur_idx))
	            cur_idx += frac_stride
	        start_idx += size
	    return set(all_steps)


	def _extract_into_tensor(arr, timesteps, broadcast_shape):
	    """
	    Look up ``arr[timesteps]`` and broadcast the result to ``broadcast_shape``.

	    :param arr: the 1-D numpy array to index.
	    :param timesteps: a tensor of indices into ``arr``; the result is placed
	                      on this tensor's device.
	    :param broadcast_shape: the shape of the returned tensor; the looked-up
	                            values are padded with trailing singleton
	                            dimensions and then broadcast to it.
	    :return: a float32 tensor of shape ``broadcast_shape``.
	    """
	    target_device = timesteps.device
	    gathered = th.from_numpy(arr).to(device=target_device)[timesteps].float()
	    missing_dims = len(broadcast_shape) - gathered.dim()
	    if missing_dims > 0:
	        # Append trailing singleton dimensions so the addition broadcasts.
	        gathered = gathered.reshape(*gathered.shape, *([1] * missing_dims))
	    return gathered + th.zeros(broadcast_shape, device=target_device)


	class GaussianDiffusion:
	    """
	    Utilities for sampling from Gaussian diffusion models.

	    The constructor precomputes, as float64 numpy arrays, the per-timestep
	    coefficients derived from ``betas``; the sampling methods look them up
	    per batch element with ``_extract_into_tensor``.

	    :param betas: 1-D sequence of per-timestep betas, each in (0, 1]. At
	                  least two entries are needed because index 1 of the
	                  posterior variance is read.
	    :param model_mean_type: what the model output represents: "x_prev",
	                            "x_start" or "epsilon".
	    :param model_var_type: "fixed_large" or "fixed_small".
	    :param channel_scales: optional per-channel multipliers used by
	                           scale_channels() / unscale_channels(). Also read
	                           by p_mean_variance() when clip_denoised is true.
	    :param channel_biases: optional per-channel offsets used by
	                           scale_channels() / unscale_channels().
	    """

	    def __init__(
	        self,
	        *,
	        betas: Sequence[float],
	        model_mean_type: str,
	        model_var_type: str,
	        channel_scales: Optional[np.ndarray] = None,
	        channel_biases: Optional[np.ndarray] = None,
	    ):
	        self.model_mean_type = model_mean_type
	        self.model_var_type = model_var_type
	        self.channel_scales = channel_scales
	        self.channel_biases = channel_biases

	        # Use float64 for accuracy
	        betas = np.array(betas, dtype=np.float64)
	        self.betas = betas
	        assert len(betas.shape) == 1, "betas must be 1-D"
	        assert (betas > 0).all() and (betas <= 1).all()

	        self.num_timesteps = int(betas.shape[0])

	        alphas = 1.0 - betas
	        self.alphas_cumprod = np.cumprod(alphas, axis=0)
	        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])

	        # calculations for diffusion q(x_t | x_{t-1}) and others
	        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
	        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
	        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
	        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

	        # calculations for posterior q(x_{t-1} | x_t, x_0)
	        self.posterior_variance = (
	            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
	        )
	        # below: log calculation clipped because the posterior variance is 0
	        # at the beginning of the diffusion chain; entry 0 is replaced by
	        # entry 1 before taking the log.
	        self.posterior_log_variance_clipped = np.log(
	            np.append(self.posterior_variance[1], self.posterior_variance[1:])
	        )

	        self.posterior_mean_coef1 = (
	            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
	        )
	        self.posterior_mean_coef2 = (
	            (1.0 - self.alphas_cumprod_prev)
	            * np.sqrt(alphas)
	            / (1.0 - self.alphas_cumprod)
	        )

	    @staticmethod
	    def _per_channel(values: np.ndarray, x: th.Tensor) -> th.Tensor:
	        """Return ``values`` as a tensor matching x's dtype/device, shaped [1, C, 1, ...]."""
	        return th.from_numpy(values).to(x).reshape(
	            [1, -1, *([1] * (len(x.shape) - 2))]
	        )

	    def scale_channels(self, x: th.Tensor) -> th.Tensor:
	        """
	        Apply channel-wise scaling: multiply by channel_scales, then add
	        channel_biases. Either step is skipped when its array is None.

	        :param x: tensor whose dimension 1 is the channel dimension.
	        :return: the scaled tensor.
	        """
	        if self.channel_scales is not None:
	            x = x * self._per_channel(self.channel_scales, x)
	        if self.channel_biases is not None:
	            x = x + self._per_channel(self.channel_biases, x)
	        return x

	    def unscale_channels(self, x: th.Tensor) -> th.Tensor:
	        """
	        Remove channel-wise scaling: subtract channel_biases, then divide by
	        channel_scales (the inverse of scale_channels()).

	        :param x: tensor whose dimension 1 is the channel dimension.
	        :return: the unscaled tensor.
	        """
	        if self.channel_biases is not None:
	            x = x - self._per_channel(self.channel_biases, x)
	        if self.channel_scales is not None:
	            x = x / self._per_channel(self.channel_scales, x)
	        return x

	    def unscale_out_dict(
	        self, out: Dict[str, Union[th.Tensor, Any]]
	    ) -> Dict[str, Union[th.Tensor, Any]]:
	        """
	        Return a copy of ``out`` in which every tensor value has been passed
	        through unscale_channels(); non-tensor values are kept as they are.
	        """
	        return {
	            k: (self.unscale_channels(v) if isinstance(v, th.Tensor) else v)
	            for k, v in out.items()
	        }

	    def q_posterior_mean_variance(self, x_start, x_t, t):
	        """
	        Compute the mean and variance of the diffusion posterior:

	            q(x_{t-1} | x_t, x_0)

	        :param x_start: the (predicted) x_0 tensor, same shape as x_t.
	        :param x_t: the noisy tensor at timestep t.
	        :param t: a tensor of timestep indices, one per batch element.
	        :return: a tuple (posterior_mean, posterior_variance,
	                 posterior_log_variance_clipped), each of x_t's shape.
	        """
	        assert x_start.shape == x_t.shape
	        posterior_mean = (
	            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
	            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
	        )
	        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
	        posterior_log_variance_clipped = _extract_into_tensor(
	            self.posterior_log_variance_clipped, t, x_t.shape
	        )
	        assert (
	            posterior_mean.shape[0]
	            == posterior_variance.shape[0]
	            == posterior_log_variance_clipped.shape[0]
	            == x_start.shape[0]
	        )
	        return posterior_mean, posterior_variance, posterior_log_variance_clipped

	    def p_mean_variance(
	        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
	    ):
	        """
	        Apply the model to get p(x_{t-1} | x_t).

	        :param model: callable invoked as ``model(x, t, **model_kwargs)``. It
	                      may return either the prediction or a tuple
	                      ``(prediction, prev_latent)``.
	        :param x: the tensor at timestep t; dimension 0 is the batch.
	        :param t: a 1-D tensor of timestep indices of length x.shape[0].
	        :param clip_denoised: if True, clamp the predicted x_start using
	                              channel_scales (which must then be set).
	        :param denoised_fn: optional function applied to the predicted
	                            x_start before clipping.
	        :param model_kwargs: optional dict of extra keyword arguments for the
	                             model. When the model returns a tuple, its
	                             second element is stored into this dict under
	                             "prev_latent" (the caller's dict is mutated).
	        :return: a dict with keys "mean", "variance", "log_variance" and
	                 "pred_xstart".
	        :raises KeyError: if model_var_type is not "fixed_large" or
	                          "fixed_small".
	        :raises NotImplementedError: if model_mean_type is not supported.
	        """
	        if model_kwargs is None:
	            model_kwargs = {}

	        batch_size, _ = x.shape[:2]
	        assert t.shape == (batch_size,)

	        # Direct prediction of eps
	        model_output = model(x, t, **model_kwargs)
	        if isinstance(model_output, tuple):
	            model_output, prev_latent = model_output
	            model_kwargs["prev_latent"] = prev_latent

	        # Convert model output to mean and variance. Only the table for the
	        # configured variance type is built.
	        if self.model_var_type == "fixed_large":
	            # for fixedlarge, we set the initial (log-)variance like so
	            # to get a better decoder log likelihood.
	            model_variance = np.append(self.posterior_variance[1], self.betas[1:])
	            model_log_variance = np.log(model_variance)
	        elif self.model_var_type == "fixed_small":
	            model_variance = self.posterior_variance
	            model_log_variance = self.posterior_log_variance_clipped
	        else:
	            raise KeyError(self.model_var_type)
	        model_variance = _extract_into_tensor(model_variance, t, x.shape)
	        model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

	        def process_xstart(x_start):
	            if denoised_fn is not None:
	                x_start = denoised_fn(x_start)
	            if clip_denoised:
	                # All channels are clamped to +/- 0.67 * channel_scales[0];
	                # channels 3 and up are then clamped to
	                # +/- 0.5 * channel_scales[3].
	                # NOTE(review): this requires channel_scales to be set and to
	                # have at least 4 entries -- confirm against callers.
	                x_start = x_start.clamp(
	                    -self.channel_scales[0] * 0.67, self.channel_scales[0] * 0.67
	                )
	                x_start[:, 3:] = x_start[:, 3:].clamp(
	                    -self.channel_scales[3] * 0.5, self.channel_scales[3] * 0.5
	                )
	            return x_start

	        if self.model_mean_type == "x_prev":
	            pred_xstart = process_xstart(
	                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
	            )
	            model_mean = model_output
	        elif self.model_mean_type in ["x_start", "epsilon"]:
	            if self.model_mean_type == "x_start":
	                pred_xstart = process_xstart(model_output)
	            else:
	                pred_xstart = process_xstart(
	                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
	                )
	            model_mean, _, _ = self.q_posterior_mean_variance(
	                x_start=pred_xstart, x_t=x, t=t
	            )
	        else:
	            raise NotImplementedError(self.model_mean_type)

	        assert (
	            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
	        )
	        return {
	            "mean": model_mean,
	            "variance": model_variance,
	            "log_variance": model_log_variance,
	            "pred_xstart": pred_xstart,
	        }

	    def _predict_xstart_from_eps(self, x_t, t, eps):
	        """Recover x_start from x_t and a noise prediction ``eps``."""
	        assert x_t.shape == eps.shape
	        return (
	            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
	            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
	        )

	    def _predict_xstart_from_xprev(self, x_t, t, xprev):
	        """Recover x_start from x_t and a predicted x_{t-1} (``xprev``)."""
	        assert x_t.shape == xprev.shape
	        return (  # (xprev - coef2*x_t) / coef1
	            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
	            - _extract_into_tensor(
	                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
	            )
	            * x_t
	        )

	    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
	        """Recover the noise ``eps`` from x_t and a predicted x_start."""
	        return (
	            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
	            - pred_xstart
	        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

	    def ddim_sample_loop_progressive(
	        self,
	        model,
	        shape,
	        noise=None,
	        clip_denoised=True,
	        denoised_fn=None,
	        model_kwargs=None,
	        device=None,
	        progress=False,
	        eta=0.0,
	    ):
	        """
	        Use DDIM to sample from the model and yield intermediate samples.

	        Timesteps are visited from num_timesteps - 1 down to 0. After each
	        step the output dict of ddim_sample(), passed through
	        unscale_out_dict(), is yielded.

	        :param model: the model, forwarded to ddim_sample().
	        :param shape: tuple or list giving the sample shape; shape[0] is the
	                      batch size.
	        :param noise: optional starting tensor; drawn with th.randn when None.
	        :param clip_denoised: forwarded to ddim_sample().
	        :param denoised_fn: forwarded to ddim_sample().
	        :param model_kwargs: forwarded to ddim_sample(); the same object is
	                             passed at every step.
	        :param device: device for the noise and timestep tensors; defaults
	                       to the device of the model's first parameter.
	        :param progress: if True, wrap the loop in a tqdm progress bar.
	        :param eta: forwarded to ddim_sample().
	        """
	        if device is None:
	            device = next(model.parameters()).device
	        assert isinstance(shape, (tuple, list))
	        if noise is not None:
	            img = noise
	        else:
	            img = th.randn(*shape, device=device)

	        indices = list(range(self.num_timesteps))[::-1]

	        if progress:
	            # Imported lazily so tqdm is only needed when a bar is requested.
	            from tqdm.auto import tqdm

	            indices = tqdm(indices)

	        # NOTE(review): p_mean_variance() stores "prev_latent" into the dict
	        # it receives. With a caller-supplied dict that value is visible on
	        # the next step; with model_kwargs=None a fresh dict is created on
	        # every step and it is not carried over -- confirm this is intended.
	        for i in indices:
	            t = th.tensor([i] * shape[0], device=device)
	            with th.no_grad():
	                out = self.ddim_sample(
	                    model,
	                    img,
	                    t,
	                    clip_denoised=clip_denoised,
	                    denoised_fn=denoised_fn,
	                    model_kwargs=model_kwargs,
	                    eta=eta,
	                )
	                yield self.unscale_out_dict(out)
	                img = out["sample"]

	    def ddim_sample(
	        self,
	        model,
	        x,
	        t,
	        clip_denoised=True,
	        denoised_fn=None,
	        model_kwargs=None,
	        eta=0.0,
	    ):
	        """
	        Sample x_{t-1} from the model using DDIM.

	        :param model: the model, forwarded to p_mean_variance().
	        :param x: the current tensor x_t.
	        :param t: a 1-D tensor of timestep indices, one per batch element.
	        :param clip_denoised: forwarded to p_mean_variance().
	        :param denoised_fn: forwarded to p_mean_variance().
	        :param model_kwargs: forwarded to p_mean_variance().
	        :param eta: multiplier on the noise scale sigma; with 0.0 no noise
	                    is added.
	        :return: a dict with keys "sample" and "pred_xstart".
	        """
	        out = self.p_mean_variance(
	            model,
	            x,
	            t,
	            clip_denoised=clip_denoised,
	            denoised_fn=denoised_fn,
	            model_kwargs=model_kwargs,
	        )

	        # Usually our model outputs epsilon, but we re-derive it
	        # in case we used x_start or x_prev prediction.
	        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

	        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
	        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
	        sigma = (
	            eta
	            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
	            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
	        )

	        # Equation 12.
	        noise = th.randn_like(x)
	        mean_pred = (
	            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
	            + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps
	        )
	        # No noise is added for batch elements whose timestep is 0.
	        nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
	        sample = mean_pred + nonzero_mask * sigma * noise
	        return {"sample": sample, "pred_xstart": out["pred_xstart"]}


	class SpacedDiffusion(GaussianDiffusion):
	    """
	    A diffusion process which can skip steps in a base diffusion process.

	    The betas of the retained timesteps are recomputed from the base
	    process's cumulative alphas, and ``timestep_map`` records, for each
	    retained step, its index in the original process.

	    :param use_timesteps: a collection (sequence or set) of timesteps from
	                          the original diffusion process to retain.
	    :param kwargs: the keyword arguments used to create the base
	                   GaussianDiffusion; must include "betas".
	    """

	    def __init__(self, use_timesteps: Iterable[int], **kwargs):
	        self.use_timesteps = set(use_timesteps)
	        self.timestep_map = []
	        self.original_num_steps = len(kwargs["betas"])

	        base_diffusion = GaussianDiffusion(**kwargs)
	        last_alpha_cumprod = 1.0
	        new_betas = []
	        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
	            if i in self.use_timesteps:
	                # Beta that takes the cumulative alpha from the previously
	                # retained step to this one.
	                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
	                last_alpha_cumprod = alpha_cumprod
	                self.timestep_map.append(i)
	        kwargs["betas"] = np.array(new_betas)
	        super().__init__(**kwargs)

	    def p_mean_variance(self, model, *args, **kwargs):
	        """
	        Same as GaussianDiffusion.p_mean_variance(), but the model is wrapped
	        so that it receives timesteps of the original process.
	        """
	        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

	    def _wrap_model(self, model):
	        """Wrap ``model`` in a _WrappedModel unless it already is one."""
	        if isinstance(model, _WrappedModel):
	            return model
	        return _WrappedModel(model, self.timestep_map, self.original_num_steps)


	class _WrappedModel:
	    """
	    Helper class to wrap models for SpacedDiffusion.

	    Calling the wrapper translates each incoming timestep index through
	    ``timestep_map`` before handing it to the wrapped model.
	    """

	    def __init__(self, model, timestep_map, original_num_steps):
	        """
	        :param model: the callable to wrap, invoked as model(x, ts, **kwargs).
	        :param timestep_map: sequence mapping spaced indices to the indices
	                             passed on to the model.
	        :param original_num_steps: stored on the instance; not read by
	                                   __call__.
	        """
	        self.model = model
	        self.timestep_map = timestep_map
	        self.original_num_steps = original_num_steps

	    def __call__(self, x, ts, **kwargs):
	        """Call the wrapped model with ``ts`` remapped through timestep_map."""
	        lookup = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
	        return self.model(x, lookup[ts], **kwargs)