Delete models

Browse files

Files changed (3) hide show

models/diffusion.py +0 -312
models/dit.py +0 -369
models/ema.py +0 -97

models/diffusion.py DELETED Viewed

@@ -1,312 +0,0 @@
-import itertools
-import math
-import torch
-import numpy as np
-import pytorch_lightning as L
-import torchmetrics
-from dataclasses import dataclass
-import dit, ema
-import noise_schedule  # Assuming this is part of the MDLM repository
-# Natural logarithm of 2; BPD.compute divides by it to express a value in base 2.
-LOG2 = math.log(2)
-@dataclass
-class Loss:
-    """Bundle of values returned by ``Diffusion.compute_loss``."""
-    # Scalar: sum of the masked per-position losses divided by the mask sum.
-    loss: torch.FloatTensor
-    # Per-position losses after multiplication by the attention mask.
-    nlls: torch.FloatTensor
-    # The attention mask that was applied to obtain ``nlls``.
-    token_mask: torch.FloatTensor
-class NLL(torchmetrics.MeanMetric):
-    """Plain ``torchmetrics.MeanMetric``; serves as the base of the derived metrics."""
-class BPD(NLL):
-    def compute(self) -> torch.Tensor:
-        """Return the accumulated ratio ``mean_value / weight`` divided by ``LOG2``.
-        Returns:
-          bpd
-        """
-        ratio = self.mean_value / self.weight
-        return ratio / LOG2
-class Perplexity(NLL):
-    def compute(self) -> torch.Tensor:
-        """Return the exponential of the accumulated ratio ``mean_value / weight``.
-        Returns:
-         Perplexity
-        """
-        ratio = self.mean_value / self.weight
-        return torch.exp(ratio)
-# Based on MDLM repo
-class Diffusion(L.LightningModule):
-    """Masking-based discrete diffusion module built around a ``dit.DIT`` backbone.
-    Inputs are noised by replacing entries with ``mask_index`` (``q_xt``), the
-    backbone output is post-processed by ``subs_parameterization``, and the loss
-    is the discrete-time D3PM loss when ``config.T > 0`` or the continuous-time
-    SUBS loss otherwise.
-    Args:
-        config: Project config object. Attributes read by this class include
-            ``T``, ``SUBS_MASKING``, ``TIME_CONDITIONING``,
-            ``Training.ANTITHETIC_SAMPLING``, ``Training.SAMPLING_EPS``,
-            ``Optim.LR``, ``Loader.BATCH_SIZE``, ``noise.type`` and
-            ``model.length``.
-        latent_dim: Forwarded to the backbone as its ``vocab_size``.
-        tokenizer: Object providing ``mask_token_id``, ``eos_token_id`` and
-            ``decode``.
-    """
-    def __init__(self, config, latent_dim, tokenizer):
-        super().__init__()
-        self.config = config
-        self.latent_dim = latent_dim
-        self.tokenizer = tokenizer
-        self.backbone = dit.DIT(self.config, vocab_size=self.latent_dim)
-        self.T = self.config.T
-        self.subs_masking = self.config.SUBS_MASKING
-        self.antithetic_sampling = self.config.Training.ANTITHETIC_SAMPLING
-        self.mask_index = self.tokenizer.mask_token_id
-        self.softplus = torch.nn.Softplus()
-        # One metric collection, cloned per split so each split keeps its own state.
-        metrics = torchmetrics.MetricCollection({
-            'nll': NLL(),
-            'bpd': BPD(),
-            'ppl': Perplexity(),
-        })
-        metrics.set_dtype(torch.float64)
-        self.train_metrics = metrics.clone(prefix='train/')
-        self.valid_metrics = metrics.clone(prefix='val/')
-        self.test_metrics = metrics.clone(prefix='test/')
-        self.noise = noise_schedule.get_noise(self.config, dtype=self.dtype)
-        self.lr = self.config.Optim.LR
-        self.sampling_eps = self.config.Training.SAMPLING_EPS
-        self.time_conditioning = self.config.TIME_CONDITIONING
-        # Large finite stand-in for -inf in log space.
-        self.neg_infinity = -1000000.0
-    ############ FORWARD DIFFUSION #########
-    def subs_parameterization(self, logits, noised_latents):
-        """Convert raw backbone logits into SUBS log-probabilities.
-        The mask class is pushed to ``neg_infinity``, the logits are normalised
-        with ``logsumexp``, and every position that is already unmasked gets
-        log-probability 0 on its own token and ``neg_infinity`` elsewhere.
-        NOTE: the first statement modifies the caller's ``logits`` in place.
-        """
-        # log prob at the mask index = - infinity
-        logits[:, :, self.mask_index] += self.neg_infinity
-        # Normalize the logits such that x.exp() is
-        # a probability distribution over vocab_size.
-        logits = logits - torch.logsumexp(logits, dim=-1,
-                                        keepdim=True)
-        # Apply updates directly in the logits matrix.
-        # For the logits of the unmasked tokens, set all values
-        # to -infinity except for the indices corresponding to
-        # the unmasked tokens.
-        unmasked_indices = (noised_latents != self.mask_index)
-        logits[unmasked_indices] = self.neg_infinity
-        logits[unmasked_indices, noised_latents[unmasked_indices]] = 0
-        return logits
-    def forward(self, latents, sigma):
-        """Run the backbone on ``latents`` (cast to int64) and apply SUBS.
-        Returns the log-probabilities produced by ``subs_parameterization``.
-        """
-        latents = latents.long()
-        with torch.cuda.amp.autocast(dtype=torch.float32):
-            logits = self.backbone(latents, sigma)
-        # A leftover debugging ``print(logits)`` was removed here: it dumped the
-        # full logits tensor on every forward pass.
-        return self.subs_parameterization(logits, latents)
-    def q_xt(self, latents, move_chance):
-        """
-        Computes the noisy sample xt.
-        Args:
-            latents: input tensor; it is averaged over dim 1 before masking.
-            move_chance: float torch.Tensor with shape (batch_size, 1).
-        Returns:
-            Tensor where each entry was replaced by ``mask_index`` with
-            probability ``move_chance``.
-        """
-        # NOTE(review): the mean over dim 1 presumably expects float latents of
-        # shape [bsz x seq_len x 1280]; d3pm_loss later gathers with the
-        # un-averaged latents - confirm the intended shapes/dtypes.
-        latents = latents.mean(dim=1) # [bsz x seq_len x 1280] --> [bsz x 1280] as per args
-        move_indices = torch.rand(* latents.shape, device=latents.device) < move_chance
-        noised_latents = torch.where(move_indices, self.mask_index, latents)
-        return noised_latents
-    def sample_timestep(self, n, device):
-        """Draw ``n`` timesteps in ``[sampling_eps, 1)``.
-        With antithetic sampling enabled, the uniform draws are stratified: one
-        draw per interval of width ``1/n``.
-        """
-        _eps_t = torch.rand(n, device=device)
-        if self.antithetic_sampling:
-            offset = torch.arange(n, device=device) / n
-            _eps_t = (_eps_t / n + offset) % 1
-        t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
-        return t
-    def d3pm_loss(self, model_output, xt, x0, t):
-        """Computes the D3PM loss between noisy latents and the original input at a given time step.
-        Only positions where ``xt`` equals ``mask_index`` contribute; the result is
-        scaled by ``self.T``.
-        """
-        dt = 1 / self.T
-        if torch.is_tensor(t):
-            t = t[:, None]
-            assert t.ndim == 2
-            t = t.clamp(0., 1. - 1e-4)
-        # ``+ zeros_like(xt)`` broadcasts the per-sample value to xt's shape.
-        alpha_t = 1 - t + torch.zeros_like(xt)
-        alpha_s = 1 - (t - dt) + torch.zeros_like(xt)
-        x0 = x0.to(torch.int64)
-        log_x_theta_at_x0 = torch.gather(model_output, -1, x0[:, :, None]).squeeze(-1)
-        log_x_theta_at_m = model_output[:, :, self.mask_index]
-        x_theta_at_m = log_x_theta_at_m.exp()
-        term_1_coef = dt / t
-        term_1_log_nr = torch.log(alpha_t * x_theta_at_m / t + 1)
-        term_1_log_dr = log_x_theta_at_x0
-        term_2_coef = 1 - dt / t
-        term_2_log_nr = term_1_log_nr
-        term_2_log_dr = torch.log(alpha_s * x_theta_at_m / (t - dt) + 1)
-        L_vb_masked = (
-            term_1_coef * (term_1_log_nr - term_1_log_dr)
-            + term_2_coef * (term_2_log_nr - term_2_log_dr))
-        L_vb = L_vb_masked * (xt == self.mask_index)
-        return self.T * L_vb
-    def forward_diffusion(self, latents):
-        """Forward diffusion process, adds noise to the latents.
-        Samples timesteps, noises ``latents`` with ``q_xt``, runs the model and
-        returns the per-position loss (D3PM when ``T > 0``, SUBS otherwise).
-        """
-        t = self.sample_timestep(latents.shape[0], latents.device)
-        if self.T > 0:
-            # Snap t onto the discrete grid.
-            t = (t * self.T).to(torch.int)
-            t = t / self.T
-            # t \in {1/T, 2/T, ..., 1}
-            t += (1 / self.T)
-        sigma, dsigma = self.noise(t)
-        unet_conditioning = sigma[:, None]
-        move_chance = 1 - torch.exp(-sigma[:, None])
-        noised_latents = self.q_xt(latents, move_chance)
-        model_output = self.forward(noised_latents, unet_conditioning)
-        if self.T > 0:
-            diffusion_loss = self.d3pm_loss(model_output=model_output, xt=noised_latents, x0=latents, t=t)
-            return diffusion_loss
-        # SUBS parameterization, continuous time.
-        else:
-            log_p_theta = torch.gather(input=model_output, dim=-1, index=latents[:, :, None]).squeeze(-1)
-            return - log_p_theta * (dsigma / torch.expm1(sigma))[:, None]
-    ######### LOSS CALCULATIONS #########
-    def maybe_sub_sample(self, x0, attention_mask):
-        """Return ``(input_tokens, output_tokens, attention_mask)`` unchanged.
-        The inputs are passed through as-is and ``output_tokens`` is always
-        ``None``; the cropping and 'ar' branches that used to live here were
-        commented out and have been dropped.
-        """
-        input_tokens = x0
-        output_tokens = None
-        new_attention_mask = attention_mask
-        return input_tokens, output_tokens, new_attention_mask
-    def compute_loss(self, latents, attention_mask):
-        """Average of MLM losses to stabilize training.
-        Returns:
-            ``Loss`` holding the mask-normalised scalar loss, the masked
-            per-position losses and the attention mask.
-        """
-        (input_tokens, output_tokens, attention_mask) = self.maybe_sub_sample(latents, attention_mask)
-        loss = self.forward_diffusion(input_tokens)
-        nlls = loss * attention_mask
-        count = attention_mask.sum()
-        batch_nll = nlls.sum()
-        token_nll = batch_nll / count
-        return Loss(loss=token_nll, nlls=nlls, token_mask=attention_mask)
-    ######### TRAINING #########
-    def training_step(self, batch, batch_idx):
-        """Compute the loss for one ``(latents, attention_mask)`` batch."""
-        latents, attention_mask = batch
-        loss = self.compute_loss(latents, attention_mask)
-        # NOTE(review): this returns the ``Loss`` dataclass. Lightning's automatic
-        # optimization expects a Tensor, a dict with a 'loss' key, or None -
-        # confirm whether ``loss.loss`` should be returned instead.
-        return loss
-    def configure_optimizers(self):
-        """Return an Adam optimizer over all parameters with learning rate ``self.lr``."""
-        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
-        return optimizer
-    def validation_step(self, batch):
-        """Compute the loss for one ``(latents, attention_mask)`` validation batch."""
-        latents, attention_mask = batch
-        loss = self.compute_loss(latents, attention_mask)
-        return loss
-    ######### GENERATION #########
-    def sample_prior(self, *batch_dims):
-        """Return an int64 tensor of shape ``batch_dims`` filled with ``mask_index``."""
-        return self.mask_index * torch.ones(* batch_dims, dtype=torch.int64)
-    @staticmethod
-    def sample_categorical(categorical_probs):
-        """Sample indices along the last dim from (unnormalised) probabilities.
-        Declared as a static method: the original definition had no ``self``
-        parameter, so the call ``self.sample_categorical(q_xs)`` raised a
-        TypeError.
-        """
-        gumbel_norm = (1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log())
-        return (categorical_probs / gumbel_norm).argmax(dim=-1)
-    def ddpm_caching_update(self, x, t, dt, p_x0=None):
-        """Perform one reverse step from time ``t`` to ``t - dt``.
-        Args:
-            x: current sample; entries equal to ``mask_index`` are resampled,
-                all other entries are copied through.
-            t: timesteps, 1-D (a trailing singleton dim is squeezed).
-            dt: step size.
-            p_x0: cached model probabilities; recomputed when ``None``.
-        Returns:
-            ``(p_x0, x_next)``.
-        """
-        assert self.config.noise.type == 'loglinear'
-        sigma_t, _ = self.noise(t)
-        if t.ndim > 1:
-            t = t.squeeze(-1)
-        assert t.ndim == 1
-        move_chance_t = t[:, None, None]
-        move_chance_s = (t - dt)[:, None, None]
-        assert move_chance_t.ndim == 3, move_chance_t.shape
-        if p_x0 is None:
-            p_x0 = self.forward(x, sigma_t).exp()
-        assert move_chance_t.ndim == p_x0.ndim
-        q_xs = p_x0 * (move_chance_t - move_chance_s)
-        q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
-        _x = self.sample_categorical(q_xs)
-        copy_flag = (x != self.mask_index).to(x.dtype)
-        return p_x0, copy_flag * x + (1 - copy_flag) * _x
-    @torch.no_grad()
-    def sample_subs_guidance(self, n_samples, stride_length, num_strides, dt=0.001):
-        """Generate samples stride by stride.
-        Each of the ``num_strides + 1`` rounds starts from an all-mask sample,
-        overwrites its prefix with the tail kept from the previous round, runs
-        ``int(1 / dt) + 1`` reverse steps, and keeps the first ``stride_length``
-        tokens of the final argmax.
-        Returns:
-            ``(sampling_steps, intermediate_text_samples, sequence_lengths)``.
-        """
-        ones = torch.ones(n_samples, dtype=self.dtype,device=self.device)
-        num_steps = int(1 / dt)
-        sampling_steps = 0
-        intermediate_tokens = []
-        target = None
-        for _ in range(num_strides + 1):
-            p_x0_cache = None
-            # Fixed: the original called the undefined ``self._sample_prior``.
-            x = self.sample_prior(n_samples,self.config.model.length).to(self.device)
-            if target is not None:
-                x[:, : -stride_length] = target
-            for i in range(num_steps + 1):
-                p_x0_cache, x_next = self.ddpm_caching_update(x=x, t=(1 - i * dt) * ones, dt=dt, p_x0=p_x0_cache)
-                # The cached probabilities are only reusable while x is unchanged
-                # and the model is not time-conditioned.
-                if (not torch.allclose(x_next, x) or self.time_conditioning):
-                    p_x0_cache = None
-                    sampling_steps += 1
-                x = x_next
-            x = self.forward(x, 0 * ones).argmax(dim=-1)
-            intermediate_tokens.append(x[:, :stride_length].cpu().numpy())
-            target = x[:, stride_length:]
-        intermediate_tokens.append(target.cpu().numpy())
-        intermediate_text_samples = []
-        # Number of tokens (after the first) that precede the first EOS token.
-        sequence_lengths = ((np.concatenate(intermediate_tokens, axis=1)[:, 1:]
-                                 == self.tokenizer.eos_token_id).cumsum(-1) == 0).sum(-1)
-        for i in range(2, len(intermediate_tokens) + 1):
-            intermediate_text_samples.append(self.tokenizer.decode(np.concatenate(intermediate_tokens[:i], axis=1)))
-        return (sampling_steps, intermediate_text_samples,
-            sequence_lengths)
-    def restore_model_and_semi_ar_sample(self, stride_length, num_strides, dt=0.001):
-        """Generate samples from the model.
-        Puts the backbone and noise schedule in eval mode for sampling and
-        switches both back to train mode afterwards, even if sampling raises.
-        """
-        # Lightning auto-casting is not working in this method for some reason
-        self.backbone.eval()
-        self.noise.eval()
-        try:
-            (sampling_steps, samples, sequence_lengths) = self.sample_subs_guidance(n_samples=self.config.Loader.BATCH_SIZE,stride_length=stride_length,num_strides=num_strides,dt=dt)
-        finally:
-            self.backbone.train()
-            self.noise.train()
-        return sampling_steps, samples, sequence_lengths

models/dit.py DELETED Viewed

@@ -1,369 +0,0 @@
-import math
-import typing
-import flash_attn
-import flash_attn.layers.rotary
-import huggingface_hub
-import omegaconf
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-# Flags required to enable jit fusion kernels.
-# These are private ``torch._C`` hooks run at import time: going by their names,
-# the first two turn the profiling mode/executor off and the last two allow
-# fusion on CPU and on GPU.
-torch._C._jit_set_profiling_mode(False)
-torch._C._jit_set_profiling_executor(False)
-torch._C._jit_override_can_fuse_on_cpu(True)
-torch._C._jit_override_can_fuse_on_gpu(True)
-def bias_dropout_add_scale(
-    x: torch.Tensor,
-    bias: typing.Optional[torch.Tensor],
-    scale: torch.Tensor,
-    residual: typing.Optional[torch.Tensor],
-    prob: float,
-    training: bool) -> torch.Tensor:
-  """Compute ``residual + scale * dropout(x + bias)``.
-  ``bias`` and ``residual`` are optional; a ``None`` term is simply left out.
-  Plain ``if`` statements are used for the None checks because the scripted
-  wrappers in this module call this function.
-  """
-  if bias is None:
-    dropped = F.dropout(x, p=prob, training=training)
-  else:
-    dropped = F.dropout(x + bias, p=prob, training=training)
-  out = scale * dropped
-  if residual is not None:
-    out = residual + out
-  return out
-def get_bias_dropout_add_scale(training):
-  """Return a 5-argument callable with the ``training`` flag already bound."""
-  def _bias_dropout_add(x, bias, scale, residual, prob):
-    result = bias_dropout_add_scale(x, bias, scale, residual, prob, training)
-    return result
-  return _bias_dropout_add
-# First of two module-level definitions named ``modulate``; a later ``def``
-# rebinds the name. This one applies shift/scale without adding any dimension.
-def modulate(x: torch.Tensor,
-             shift: torch.Tensor,
-             scale: torch.Tensor) -> torch.Tensor:
-  scaled = x * (1 + scale)
-  return scaled + shift
-@torch.jit.script
-def bias_dropout_add_scale_fused_train(
-    x: torch.Tensor,
-    bias: typing.Optional[torch.Tensor],
-    scale: torch.Tensor,
-    residual: typing.Optional[torch.Tensor],
-    prob: float) -> torch.Tensor:
-  """Scripted ``bias_dropout_add_scale`` with dropout in training mode."""
-  training = True
-  return bias_dropout_add_scale(
-    x, bias, scale, residual, prob, training)
-@torch.jit.script
-def bias_dropout_add_scale_fused_inference(
-    x: torch.Tensor,
-    bias: typing.Optional[torch.Tensor],
-    scale: torch.Tensor,
-    residual: typing.Optional[torch.Tensor],
-    prob: float) -> torch.Tensor:
-  """Scripted ``bias_dropout_add_scale`` with dropout in inference mode."""
-  training = False
-  return bias_dropout_add_scale(
-    x, bias, scale, residual, prob, training)
-@torch.jit.script
-def modulate_fused(x: torch.Tensor,
-                   shift: torch.Tensor,
-                   scale: torch.Tensor) -> torch.Tensor:
-  """Scripted wrapper that delegates to ``modulate``."""
-  # NOTE(review): scripting happens at decoration time, so this presumably binds
-  # the ``modulate`` defined above rather than the later redefinition - confirm.
-  modulated = modulate(x, shift, scale)
-  return modulated
-class Rotary(torch.nn.Module):
-  """Builds and caches cos/sin tables for rotary position embeddings.
-  The tables have shape (1, seq_len, 3, 1, dim); index 2 of the third axis is
-  filled with cos=1 / sin=0. The cache is keyed on sequence length only.
-  """
-  def __init__(self, dim, base=10_000):
-    super().__init__()
-    exponents = torch.arange(0, dim, 2).float() / dim
-    inv_freq = 1.0 / (base ** exponents)
-    self.register_buffer('inv_freq', inv_freq)
-    self.seq_len_cached = None
-    self.cos_cached = None
-    self.sin_cached = None
-  def forward(self, x, seq_dim=1):
-    """Return ``(cos, sin)`` for the length of ``x`` along ``seq_dim``."""
-    seq_len = x.shape[seq_dim]
-    if seq_len == self.seq_len_cached:
-      # Same length as the previous call: reuse the stored tables.
-      return self.cos_cached, self.sin_cached
-    self.seq_len_cached = seq_len
-    positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
-    freqs = torch.einsum("i,j->ij", positions, self.inv_freq.clone())
-    emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-    # dims are: batch, seq_len, qkv, head, dim
-    cos_table = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
-    sin_table = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
-    # This makes the transformation on v an identity.
-    cos_table[:,:,2,:,:].fill_(1.)
-    sin_table[:,:,2,:,:].fill_(0.)
-    self.cos_cached = cos_table
-    self.sin_cached = sin_table
-    return self.cos_cached, self.sin_cached
-def rotate_half(x):
-  """Split the last dim in two halves ``(a, b)`` and return ``cat(-b, a)``."""
-  half = x.shape[-1] // 2
-  first, second = x[..., :half], x[..., half:]
-  return torch.cat((-second, first), dim=-1)
-def apply_rotary_pos_emb(qkv, cos, sin):
-  """Apply rotary embeddings to a packed qkv tensor via flash-attn.
-  The 5-D cos/sin tables are reduced to 2-D by taking index 0 of the first,
-  third and fourth axes and the first half of the last axis.
-  """
-  cos_half = cos.shape[-1] // 2
-  sin_half = sin.shape[-1] // 2
-  cos_2d = cos[0,:,0,0,:cos_half]
-  sin_2d = sin[0,:,0,0,:sin_half]
-  return flash_attn.layers.rotary.apply_rotary_emb_qkv_(qkv, cos_2d, sin_2d)
-# Second module-level ``modulate``: Python has no overloading, so this rebinds
-# the name. This variant inserts a dimension at index 1 into shift and scale.
-def modulate(x, shift, scale):
-  scale_b = scale.unsqueeze(1)
-  shift_b = shift.unsqueeze(1)
-  return x * (1 + scale_b) + shift_b
-#################################################################################
-#                                  Layers                                       #
-#################################################################################
-class LayerNorm(nn.Module):
-  """Layer norm over the last ``dim`` features with a learned scale and no bias.
-  The normalisation runs in float32 with CUDA autocast disabled.
-  """
-  def __init__(self, dim):
-    super().__init__()
-    self.dim = dim
-    self.weight = nn.Parameter(torch.ones([dim]))
-  def forward(self, x):
-    with torch.cuda.amp.autocast(enabled=False):
-      normed = F.layer_norm(x.float(), [self.dim])
-    scale = self.weight[None,None,:]
-    return normed * scale
-def residual_linear(x, W, x_skip, residual_scale):
-  """x_skip + residual_scale * W @ x"""
-  dim_out, dim_in = W.shape[0], W.shape[1]
-  # Flatten the leading dims so a single addmm covers the whole batch.
-  flat_skip = x_skip.view(-1, dim_out)
-  flat_x = x.view(-1, dim_in)
-  flat_out = torch.addmm(flat_skip, flat_x, W.T, alpha=residual_scale)
-  return flat_out.view(*x.shape[:-1], dim_out)
-#################################################################################
-#               Embedding Layers for Timesteps and Class Labels                 #
-#################################################################################
-class TimestepEmbedder(nn.Module):
-  """
-  Maps scalar timesteps to vectors: sinusoidal features followed by a
-  Linear -> SiLU -> Linear MLP.
-  """
-  def __init__(self, hidden_size, frequency_embedding_size=256):
-    super().__init__()
-    first = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
-    second = nn.Linear(hidden_size, hidden_size, bias=True)
-    self.mlp = nn.Sequential(first, nn.SiLU(), second)
-    self.frequency_embedding_size = frequency_embedding_size
-  @staticmethod
-  def timestep_embedding(t, dim, max_period=10000):
-    """
-    Create sinusoidal timestep embeddings.
-    :param t: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param dim: the dimension of the output.
-    :param max_period: controls the minimum frequency of the embeddings.
-    :return: an (N, D) Tensor of positional embeddings; when ``dim`` is odd
-             the last column is zero padding.
-    """
-    # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
-    half = dim // 2
-    exponent = (
-      - math.log(max_period)
-      * torch.arange(start=0, end=half, dtype=torch.float32)
-      / half)
-    freqs = torch.exp(exponent).to(device=t.device)
-    args = t[:, None].float() * freqs[None]
-    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-    if dim % 2:
-      zero_column = torch.zeros_like(embedding[:, :1])
-      embedding = torch.cat([embedding, zero_column], dim=-1)
-    return embedding
-  def forward(self, t):
-    t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
-    return self.mlp(t_freq)
-class LabelEmbedder(nn.Module):
-  """Embeds class labels into vector representations.
-  The table has ``num_classes + 1`` rows.
-  """
-  def __init__(self, num_classes, cond_size):
-    super().__init__()
-    table_rows = num_classes + 1
-    self.embedding_table = nn.Embedding(table_rows, cond_size)
-    self.num_classes = num_classes
-    # TODO think of initializing with 0.02 std deviation like in original DiT paper
-  def forward(self, labels):
-    return self.embedding_table(labels)
-#################################################################################
-#                                 Core Model                                    #
-#################################################################################
-class DDiTBlock(nn.Module):
-  """Transformer block with adaLN conditioning: attention, then an MLP.
-  The conditioning vector is projected to six modulation tensors (shift,
-  scale and gate for the attention branch and for the MLP branch). The
-  projection is zero-initialised.
-  Args:
-    dim: model width.
-    n_heads: number of attention heads.
-    cond_dim: width of the conditioning vector.
-    mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
-    dropout: dropout probability used in both residual branches.
-  """
-  def __init__(self, dim, n_heads, cond_dim, mlp_ratio=4, dropout=0.1):
-    super().__init__()
-    self.n_heads = n_heads
-    self.norm1 = LayerNorm(dim)
-    self.attn_qkv = nn.Linear(dim, 3 * dim, bias=False)
-    self.attn_out = nn.Linear(dim, dim, bias=False)
-    self.dropout1 = nn.Dropout(dropout)
-    self.norm2 = LayerNorm(dim)
-    self.mlp = nn.Sequential(
-      nn.Linear(dim, mlp_ratio * dim, bias=True),
-      nn.GELU(approximate='tanh'),
-      nn.Linear(mlp_ratio * dim, dim, bias=True))
-    self.dropout2 = nn.Dropout(dropout)
-    self.dropout = dropout
-    self.adaLN_modulation = nn.Linear(cond_dim, 6 * dim, bias=True)
-    self.adaLN_modulation.weight.data.zero_()
-    self.adaLN_modulation.bias.data.zero_()
-  def _get_bias_dropout_scale(self):
-    """Pick the scripted residual helper matching the current train/eval mode."""
-    if self.training:
-      return bias_dropout_add_scale_fused_train
-    else:
-      return bias_dropout_add_scale_fused_inference
-  def forward(self, x, rotary_cos_sin, c, seqlens=None):
-    """Apply the block.
-    Args:
-      x: activations; the first two dims are read as (batch, seq_len).
-      rotary_cos_sin: ``(cos, sin)`` pair handed to ``apply_rotary_pos_emb``.
-      c: conditioning of shape (batch, cond_dim) or (batch, 1, cond_dim).
-      seqlens: optional per-sequence lengths; equal-length sequences are
-        assumed when ``None``.
-    Returns:
-      Tensor produced by the attention and MLP residual branches.
-    """
-    batch_size, seq_len = x.shape[0], x.shape[1]
-    bias_dropout_scale_fn = self._get_bias_dropout_scale()
-    # The original indexed ``[:, None][0]`` here. For a (batch, 1, cond_dim)
-    # input that kept only the first batch element's modulation, and for a
-    # (batch, cond_dim) input the following ``chunk(dim=2)`` failed on a 2-D
-    # tensor. Keep the batch axis and add the sequence axis only when missing.
-    modulation = self.adaLN_modulation(c)
-    if modulation.ndim == 2:
-      modulation = modulation[:, None]
-    (shift_msa, scale_msa, gate_msa,
-     shift_mlp, scale_mlp, gate_mlp) = modulation.chunk(6, dim=-1)
-    # attention operation
-    x_skip = x
-    x = modulate_fused(self.norm1(x), shift_msa, scale_msa)
-    qkv = self.attn_qkv(x)
-    qkv = rearrange(qkv,
-                    'b s (three h d) -> b s three h d',
-                    three=3,
-                    h=self.n_heads)
-    with torch.cuda.amp.autocast(enabled=False):
-      cos, sin = rotary_cos_sin
-      qkv = apply_rotary_pos_emb(
-        qkv, cos.to(qkv.dtype), sin.to(qkv.dtype))
-    # Flatten batch and sequence for the variable-length attention kernel.
-    qkv = rearrange(qkv, 'b s ... -> (b s) ...')
-    if seqlens is None:
-      # Boundaries 0, seq_len, 2 * seq_len, ..., batch_size * seq_len.
-      cu_seqlens = torch.arange(
-        0, (batch_size + 1) * seq_len, step=seq_len,
-        dtype=torch.int32, device=qkv.device)
-    else:
-      # NOTE(review): a plain cumsum has no leading 0 and keeps the input dtype;
-      # confirm this matches what the flash-attn varlen kernel expects.
-      cu_seqlens = seqlens.cumsum(-1)
-    x = flash_attn.flash_attn_interface.flash_attn_varlen_qkvpacked_func(
-      qkv, cu_seqlens, seq_len, 0., causal=False)
-    x = rearrange(x, '(b s) h d -> b s (h d)', b=batch_size)
-    x = bias_dropout_scale_fn(self.attn_out(x),
-                              None,
-                              gate_msa,
-                              x_skip,
-                              self.dropout)
-    # mlp operation
-    x = bias_dropout_scale_fn(
-      self.mlp(modulate_fused(
-        self.norm2(x), shift_mlp, scale_mlp)),
-      None, gate_mlp, x, self.dropout)
-    return x
-class EmbeddingLayer(nn.Module):
-  """Lookup table of shape (vocab_dim, dim), initialised with Kaiming-uniform."""
-  def __init__(self, dim, vocab_dim):
-    super().__init__()
-    table_shape = (vocab_dim, dim)
-    self.embedding = nn.Parameter(torch.empty(table_shape))
-    torch.nn.init.kaiming_uniform_(self.embedding, a=math.sqrt(5))
-  def forward(self, x):
-    """Return the rows of the table selected by the integer tensor ``x``."""
-    table = self.embedding
-    return table[x]
-class DDitFinalLayer(nn.Module):
-  """Final adaLN-modulated layer norm followed by a linear output projection.
-  Both the output projection and the modulation projection are
-  zero-initialised.
-  Args:
-    hidden_size: model width.
-    out_channels: width of the output.
-    cond_dim: width of the conditioning vector.
-  """
-  def __init__(self, hidden_size, out_channels, cond_dim):
-    super().__init__()
-    self.norm_final = LayerNorm(hidden_size)
-    self.linear = nn.Linear(hidden_size, out_channels)
-    self.linear.weight.data.zero_()
-    self.linear.bias.data.zero_()
-    self.adaLN_modulation = nn.Linear(cond_dim,
-                                      2 * hidden_size,
-                                      bias=True)
-    self.adaLN_modulation.weight.data.zero_()
-    self.adaLN_modulation.bias.data.zero_()
-  def forward(self, x, c):
-    """Modulate ``norm_final(x)`` with shift/scale derived from ``c`` and project.
-    ``c`` may have shape (batch, cond_dim) or (batch, 1, cond_dim).
-    """
-    # The original indexed ``[:, None][0]`` here. For a (batch, 1, cond_dim)
-    # input that kept only the first batch element's modulation, and for a
-    # (batch, cond_dim) input the following ``chunk(dim=2)`` failed on a 2-D
-    # tensor. Keep the batch axis and add the sequence axis only when missing.
-    modulation = self.adaLN_modulation(c)
-    if modulation.ndim == 2:
-      modulation = modulation[:, None]
-    shift, scale = modulation.chunk(2, dim=-1)
-    x = modulate_fused(self.norm_final(x), shift, scale)
-    x = self.linear(x)
-    return x
-class DIT(nn.Module, huggingface_hub.PyTorchModelHubMixin):
-  """Diffusion transformer backbone: token embedding, sigma embedding, a stack
-  of ``DDiTBlock`` layers and a ``DDitFinalLayer`` output head.
-  Args:
-    config: config object (or a plain dict, converted with OmegaConf). Reads
-      ``model.hidden_size``, ``model.cond_dim``, ``model.n_heads``,
-      ``model.n_blocks`` and ``model.dropout``.
-    vocab_size: size of the embedding table and of the output dimension.
-  """
-  def __init__(self, config, vocab_size: int):
-    super().__init__()
-    # isinstance (rather than an exact type comparison) also accepts dict
-    # subclasses.
-    if isinstance(config, dict):
-      config = omegaconf.OmegaConf.create(config)
-    self.config = config
-    self.vocab_size = vocab_size
-    self.vocab_embed = EmbeddingLayer(config.model.hidden_size,
-                                      vocab_size)
-    self.sigma_map = TimestepEmbedder(config.model.cond_dim)
-    # Rotary tables are built per attention head.
-    self.rotary_emb = Rotary(
-      config.model.hidden_size // config.model.n_heads)
-    self.blocks = nn.ModuleList([
-      DDiTBlock(config.model.hidden_size,
-                config.model.n_heads,
-                config.model.cond_dim,
-                dropout=config.model.dropout)
-      for _ in range(config.model.n_blocks)])
-    self.output_layer = DDitFinalLayer(
-      config.model.hidden_size,
-      vocab_size,
-      config.model.cond_dim)
-    #self.scale_by_sigma = config.model.scale_by_sigma
-  def _get_bias_dropout_scale(self):
-    """Pick the scripted residual helper matching the current train/eval mode."""
-    if self.training:
-      return bias_dropout_add_scale_fused_train
-    else:
-      return  bias_dropout_add_scale_fused_inference
-  def forward(self, indices, sigma):
-    """Embed ``indices``, condition on ``sigma`` and return the output-layer result.
-    The blocks and the output layer run under CUDA autocast with bfloat16.
-    """
-    x = self.vocab_embed(indices)
-    c = F.silu(self.sigma_map(sigma))
-    rotary_cos_sin = self.rotary_emb(x)
-    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-      for block in self.blocks:
-        x = block(x, rotary_cos_sin, c, seqlens=None)
-      x = self.output_layer(x, c)
-    return x

models/ema.py DELETED Viewed

@@ -1,97 +0,0 @@
-import torch
-class ExponentialMovingAverage:
-  """
-  Keeps an exponentially averaged ("shadow") copy of trainable parameters.
-  """
-  def __init__(self, parameters, decay, use_num_updates=True):
-    """
-    Args:
-        parameters: Iterable of `torch.nn.Parameter`; usually the result of
-            `model.parameters()`. Only entries with `requires_grad` are tracked.
-        decay: The exponential decay.
-        use_num_updates: Whether to use number of updates when computing
-            averages.
-    Raises:
-        ValueError: if `decay` is below 0 or above 1.
-    """
-    if decay < 0.0 or decay > 1.0:
-      raise ValueError('Decay must be between 0 and 1')
-    self.decay = decay
-    self.num_updates = 0 if use_num_updates else None
-    trainable = (p for p in parameters if p.requires_grad)
-    self.shadow_params = [p.clone().detach() for p in trainable]
-    self.collected_params = []
-  def move_shadow_params_to_device(self, device):
-    """Rebuild the shadow list with every tensor moved to `device`."""
-    moved = []
-    for shadow in self.shadow_params:
-      moved.append(shadow.to(device))
-    self.shadow_params = moved
-  def update(self, parameters):
-    """
-    Move the shadow parameters one averaging step towards `parameters`.
-    Call this every time the parameters are updated, such as the result of
-    the `optimizer.step()` call.
-    Args:
-        parameters: Iterable of `torch.nn.Parameter`; usually the same set of
-            parameters used to initialize this object.
-    """
-    decay = self.decay
-    if self.num_updates is not None:
-      self.num_updates += 1
-      # Cap the decay by a ratio that grows with the number of updates.
-      warmup = (1 + self.num_updates) / (10 + self.num_updates)
-      decay = min(decay, warmup)
-    one_minus_decay = 1.0 - decay
-    with torch.no_grad():
-      trainable = [p for p in parameters if p.requires_grad]
-      for shadow, live in zip(self.shadow_params, trainable):
-        shadow.sub_(one_minus_decay * (shadow - live))
-  def copy_to(self, parameters):
-    """
-    Overwrite the trainable entries of `parameters` with the shadow values.
-    Args:
-        parameters: Iterable of `torch.nn.Parameter`; the parameters to be
-            updated with the stored moving averages.
-    """
-    trainable = [p for p in parameters if p.requires_grad]
-    for shadow, live in zip(self.shadow_params, trainable):
-      live.data.copy_(shadow.data)
-  def store(self, parameters):
-    """
-    Keep a clone of every given parameter for a later `restore`.
-    Args:
-        parameters: Iterable of `torch.nn.Parameter`; the parameters to be
-            temporarily stored.
-    """
-    snapshot = []
-    for param in parameters:
-      snapshot.append(param.clone())
-    self.collected_params = snapshot
-  def restore(self, parameters):
-    """
-    Write the values saved by `store` back into `parameters`.
-    Useful to validate the model with EMA parameters without affecting the
-    original optimization process. Store the parameters before the
-    `copy_to` method. After validation (or model saving), use this to
-    restore the former parameters.
-    Args:
-        parameters: Iterable of `torch.nn.Parameter`; the parameters to be
-            updated with the stored parameters.
-    """
-    for saved, live in zip(self.collected_params, parameters):
-      live.data.copy_(saved.data)
-  def state_dict(self):
-    """Return decay, update count and shadow parameters as a dict."""
-    return {
-      'decay': self.decay,
-      'num_updates': self.num_updates,
-      'shadow_params': self.shadow_params,
-    }
-  def load_state_dict(self, state_dict):
-    """Load the three fields produced by `state_dict`."""
-    self.decay = state_dict['decay']
-    self.num_updates = state_dict['num_updates']
-    self.shadow_params = state_dict['shadow_params']