sgoel30 committed on
Commit d061944 · verified · 1 Parent(s): 360c784

Upload 12 files

Files changed (12)
  1. config.yaml +127 -0
  2. diffusion.py +1434 -0
  3. dit.py +388 -0
  4. ema.py +97 -0
  5. esm_utils.py +15 -0
  6. generate.py +60 -0
  7. main.py +250 -0
  8. mdlm_motif_benchmarking.py +96 -0
  9. mlm_generate_utils.py +108 -0
  10. noise_schedule.py +153 -0
  11. pl_data_loader.py +819 -0
  12. utils.py +230 -0
config.yaml ADDED
@@ -0,0 +1,127 @@
+ defaults:
+ - _self_
+ - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
+ - /model: small
+ - /strategy: ddp
+ - /noise: loglinear
+ - /lr_scheduler: constant_warmup
+
+ mode: sample_eval # train / ppl_eval / sample_eval
+ diffusion: absorbing_state
+ backbone: membrane_esm_finetune # dit / dimamba / ar / vanilla_esm_pretrain / membrane_esm_finetune
+ parameterization: subs # subs / d3pm / sedd
+ time_conditioning: False
+ T: 0 # 0 (continuous time) / 1000
+ subs_masking: False
+
+ seed: 42
+
+ data:
+   train:
+     vanilla_esm_train_path: /workspace/sg666/MDpLM/data/uniref50/200k_seqs/train.csv
+     membrane_esm_train_path: /workspace/sg666/MDpLM/data/membrane/train.csv
+     wrap: null
+   test:
+     vanilla_esm_test_path: /workspace/sg666/MDpLM/data/uniref50/200k_seqs/test.csv
+     membrane_esm_test_path: /workspace/sg666/MDpLM/data/membrane/test.csv
+     wrap: null
+   valid:
+     vanilla_esm_valid_path: /workspace/sg666/MDpLM/data/uniref50/200k_seqs/val.csv
+     membrane_esm_valid_path: /workspace/sg666/MDpLM/data/membrane/val.csv
+     wrap: null
+   wrapping: True
+
+ loader:
+   global_batch_size: 8
+   eval_global_batch_size: ${.global_batch_size}
+   # Note: batch_size and eval_batch_size are **per machine**
+   batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+   eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+   num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
+   pin_memory: True
+
+ sampling:
+   predictor: ddpm_cache # analytic, ddpm, ddpm_cache
+   steps: 128
+   noise_removal: True
+   # TODO(yair): @subham, why aren't these params under `eval`?
+   num_sample_batches: 2 # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
+   num_sample_log: 2
+   semi_ar: False
+   stride_length: 1
+   num_strides: 1
+
+ training:
+   ema: 0.9999
+   antithetic_sampling: True
+   importance_sampling: False
+   sampling_eps: 1e-3
+   change_of_variables: False
+   mlm_model_path: /workspace/sg666/MDpLM/benchmarks/MLM/model_ckpts_650M/best_model_epoch
+   esm_model_path: facebook/esm2_t30_150M_UR50D
+   focus_mask: False
+
+ eval:
+   checkpoint_path: /workspace/sg666/MDpLM/checkpoints/membrane_mdlm/eos-wrapping_epochs60_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16/checkpoints/best.ckpt # Used to evaluate a checkpoint after training.
+   disable_ema: False
+   compute_generative_perplexity: False
+   perplexity_batch_size: 8
+   compute_perplexity_on_sanity: False
+   gen_ppl_eval_model_name_or_path: gpt2-large # gpt2-large, meta-llama/Llama-2-7b-hf
+   generate_samples: True
+   generation_model: /workspace/sg666/MDpLM/checkpoints/membrane_automodel/epochs60_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16/
+
+ optim:
+   weight_decay: 0.075
+   lr: 3e-4
+   beta1: 0.9
+   beta2: 0.999
+   eps: 1e-8
+
+ Model:
+   hidden_size: 1280
+   cond_dim: 256
+   n_heads: 20
+   n_blocks: 4
+   dropout: 0.5
+   length: null #512
+   scale_by_sigma: True
+
+ trainer:
+   _target_: lightning.Trainer
+   accelerator: cuda
+   num_nodes: 1
+   devices: ${device_count:}
+   accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
+   gradient_clip_val: 1.0
+   precision: bf16
+   num_sanity_val_steps: 2
+   max_epochs: 60
+   max_steps: 1_000_000
+   log_every_n_steps: 10
+   limit_train_batches: 1.0 # train on full dataset, can be used to toggle quick run
+   limit_val_batches: 1.0 # validate on full dataset, can be used to toggle quick run
+   val_check_interval: 955
+
+ wandb:
+   project: MDpLM_finetune_membrane_200k-seqs
+   notes: null
+   group: programmablebio
+   job_type: null
+   name: dit_test #dit_wrapping_epochs60_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16
+   id: ${.name}_${seed}
+
+ hydra:
+   run:
+     dir: /workspace/sg666/MDpLM/outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
+   job:
+     chdir: true
+
+ checkpointing:
+   # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
+   save_dir: /workspace/sg666/MDpLM/checkpoints/membrane_mdlm/
+   # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
+   resume_from_ckpt: false
+   resume_ckpt_path: ${.save_dir}/epochs30_lr3e-4_bsz8_gradclip1_beta-one0.9_beta-two0.999_bf16_all-params_no-compile/checkpoints/last.ckpt #/checkpoints/last.ckpt
+   pretrained_esm_mdlm_automodel_path: /workspace/sg666/MDpLM/checkpoints/vanilla_esm_pretrained_automodel/epochs10_lr3e-4_200k-seqs_bsz16_all-params_no-compile_gradclip1_beta-one0.9_beta-two0.999_bf16/
+   finetuned_esm_mdlm_automodel_path: /workspace/sg666/MDpLM/checkpoints/membrane_mdlm/
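The `loader` and `trainer` blocks above use custom OmegaConf resolvers (`div_up`, `eval`, `device_count`) that are not built into Hydra or OmegaConf; they must be registered in Python before the config is composed. The actual registration presumably lives elsewhere in this upload (e.g. main.py or utils.py); the snippet below is only a minimal sketch, under that assumption, of how such resolvers could be defined and how the per-device batch size falls out of the interpolations.

# Illustrative sketch only -- not part of the uploaded files. The resolver
# names match the interpolations in config.yaml; their exact definitions in
# the repo may differ.
import math
import torch
from omegaconf import OmegaConf

OmegaConf.register_new_resolver('div_up', lambda x, y: int(math.ceil(int(x) / int(y))))
OmegaConf.register_new_resolver('eval', eval)  # evaluates the quoted Python expression
OmegaConf.register_new_resolver('device_count', lambda: torch.cuda.device_count())

# Example: global_batch_size=8 on 1 node with 4 GPUs resolves to
#   loader.batch_size              = div_up(8, 4)         = 2 per device
#   trainer.accumulate_grad_batches = div_up(8, 4 * 2 * 1) = 1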
diffusion.py ADDED
@@ -0,0 +1,1434 @@
1
+ import itertools
2
+ import math
3
+ import os
4
+ import sys
5
+ import typing
6
+ from dataclasses import dataclass
7
+
8
+ import hydra.utils
9
+ import lightning as L
10
+ import numpy as np
11
+ import torch.nn as nn
12
+ import torch
13
+ # import dit
14
+ import ema
15
+ import time
16
+ import gc
17
+ import pl_data_loader as dataloader
18
+ import torch.nn.functional as F
19
+ import torchmetrics
20
+ import transformers
21
+ from torch import Tensor
22
+ from torch.optim.lr_scheduler import _LRScheduler
23
+ from transformers import AutoModelForMaskedLM, AutoModel, AutoTokenizer
24
+
25
+ import utils
26
+ import noise_schedule
27
+
28
+ LOG2 = math.log(2)
29
+
30
+ class CosineWarmup(_LRScheduler):
31
+ def __init__(self, optimizer, warmup_steps, total_steps, eta_ratio=0.1, last_epoch=-1):
32
+ self.warmup_steps = warmup_steps
33
+ self.total_steps = total_steps
34
+ self.eta_ratio = eta_ratio # The ratio of minimum to maximum learning rate
35
+ super(CosineWarmup, self).__init__(optimizer, last_epoch)
36
+
37
+ def get_lr(self):
38
+ if self.last_epoch < self.warmup_steps:
39
+ return [base_lr * self.last_epoch / self.warmup_steps for base_lr in self.base_lrs]
40
+
41
+ progress = (self.last_epoch - self.warmup_steps) / (self.total_steps - self.warmup_steps)
42
+ cosine_decay = 0.5 * (1 + np.cos(np.pi * progress))
43
+ decayed_lr = (1 - self.eta_ratio) * cosine_decay + self.eta_ratio
44
+
45
+ return [decayed_lr * base_lr for base_lr in self.base_lrs]
46
+
47
+
48
+ def _sample_categorical(categorical_probs):
49
+ gumbel_norm = (
50
+ 1e-10
51
+ - (torch.rand_like(categorical_probs) + 1e-10).log())
52
+ return (categorical_probs / gumbel_norm).argmax(dim=-1)
53
+
54
+
55
+ def _unsqueeze(x, reference):
56
+ return x.view(
57
+ * x.shape,
58
+ * ((1,) * (len(reference.shape) - len(x.shape))))
59
+
60
+
61
+ @dataclass
62
+ class Loss:
63
+ loss: torch.FloatTensor
64
+ nlls: torch.FloatTensor
65
+ token_mask: torch.FloatTensor
66
+
67
+
68
+ class NLL(torchmetrics.aggregation.MeanMetric):
69
+ pass
70
+
71
+
72
+ class BPD(NLL):
73
+ def compute(self) -> Tensor:
74
+ """Computes the bits per dimension.
75
+
76
+ Returns:
77
+ bpd
78
+ """
79
+ return self.mean_value / self.weight / LOG2
80
+
81
+
82
+ class Perplexity(NLL):
83
+ def compute(self) -> Tensor:
84
+ """Computes the Perplexity.
85
+
86
+ Returns:
87
+ Perplexity
88
+ """
89
+ return torch.exp(self.mean_value / self.weight)
90
+
91
+
92
+ class WrapVanillaESM(nn.Module):
93
+ def __init__(self, bert_model_path):
94
+ super(WrapVanillaESM, self).__init__()
95
+ #self.bert_model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
96
+ #self.model = AutoModelForMaskedLM.from_pretrained(bert_model_path).to(self.bert_model_device)
97
+ self.model = AutoModelForMaskedLM.from_pretrained(bert_model_path, device_map='cpu')
98
+ self.tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
99
+
100
+
101
+ def __call__(self, *args, **kwargs):
102
+ return self.model(*args, **kwargs)
103
+
104
+ def unfreeze_attn_layers(self):
105
+ model_layers = len(self.model.esm.encoder.layer)
106
+
107
+ for i, layer in enumerate(self.model.esm.encoder.layer):
108
+ if i >= model_layers-5: # fine-tune only last n layers
109
+ for module in layer.attention.self.key.modules():
110
+ for param in module.parameters():
111
+ param.requires_grad = True
112
+ for module in layer.attention.self.query.modules():
113
+ for param in module.parameters():
114
+ param.requires_grad = True
115
+ for module in layer.attention.self.value.modules():
116
+ for param in module.parameters():
117
+ param.requires_grad = True
118
+
119
+ def unfreeze_all_layers(self):
120
+ for param in self.model.parameters():
121
+ param.requires_grad = True
122
+
123
+ def forward(self, inputs, sigma, attention_mask):
124
+ logits = self.model(input_ids=inputs, attention_mask=attention_mask).logits
125
+ return logits
126
+
127
+ def save_model(self, save_dir):
128
+ self.model.save_pretrained(save_dir)
129
+ self.tokenizer.save_pretrained(save_dir)
130
+
131
+ def load_model(self, load_dir):
132
+ self.model = AutoModel.from_pretrained(load_dir)
133
+ self.tokenizer = AutoTokenizer.from_pretrained(load_dir)
134
+
135
+ class WrapMembraneESM(nn.Module):
136
+ def __init__(self, bert_model_path):
137
+ super(WrapMembraneESM, self).__init__()
138
+ #self.bert_model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
139
+ #self.model = AutoModelForMaskedLM.from_pretrained(bert_model_path).to(self.bert_model_device)
140
+ self.model = AutoModelForMaskedLM.from_pretrained(bert_model_path, device_map='cpu')
141
+ self.tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
142
+
143
+ def __call__(self, *args, **kwargs):
144
+ return self.model(*args, **kwargs)
145
+
146
+ def freeze_model(self):
147
+ for param in self.model.parameters():
148
+ param.requires_grad = False
149
+
150
+ def unfreeze_all_layers(self):
151
+ for param in self.model.parameters():
152
+ param.requires_grad = True
153
+
154
+ def unfreeze_attn_layers(self):
155
+ model_layers = len(self.model.esm.encoder.layer)
156
+
157
+ for i, layer in enumerate(self.model.esm.encoder.layer):
158
+ if i >= model_layers-11: # fine-tune only last n layers
159
+ for module in layer.attention.self.key.modules():
160
+ for param in module.parameters():
161
+ param.requires_grad = True
162
+ for module in layer.attention.self.query.modules():
163
+ for param in module.parameters():
164
+ param.requires_grad = True
165
+ for module in layer.attention.self.value.modules():
166
+ for param in module.parameters():
167
+ param.requires_grad = True
168
+
169
+ def forward(self, inputs, sigma, attention_mask):
170
+ logits = self.model(input_ids=inputs, attention_mask=attention_mask).logits
171
+ return logits
172
+
173
+ def save_model(self, save_dir):
174
+ self.model.save_pretrained(save_dir)
175
+ self.tokenizer.save_pretrained(save_dir)
176
+
177
+ def load_model(self, load_dir):
178
+ self.model = AutoModel.from_pretrained(load_dir)
179
+ self.tokenizer = AutoTokenizer.from_pretrained(load_dir)
180
+
181
+ class Diffusion(L.LightningModule):
182
+ def __init__(
183
+ self,
184
+ config,
185
+ tokenizer: transformers.PreTrainedTokenizer):
186
+ super().__init__()
187
+ self.save_hyperparameters()
188
+ self.config = config
189
+
190
+ self.tokenizer = tokenizer
191
+ self.vocab_size = self.tokenizer.vocab_size
192
+ self.sampler = self.config.sampling.predictor
193
+ self.gen_ppl_eval_model_name_or_path = self.config.eval.\
194
+ gen_ppl_eval_model_name_or_path
195
+ self.antithetic_sampling = self.config.training.antithetic_sampling
196
+ self.importance_sampling = self.config.training.importance_sampling
197
+ self.change_of_variables = self.config.training.change_of_variables
198
+ if (not hasattr(self.tokenizer, 'mask_token')
199
+ or self.tokenizer.mask_token is None):
200
+ self.mask_index = self.vocab_size
201
+ self.vocab_size += 1
202
+ else:
203
+ self.mask_index = self.tokenizer.mask_token_id
204
+ self.parameterization = self.config.parameterization
205
+
206
+
207
+ # if self.config.backbone == 'dit':
208
+ # self.backbone = dit.DIT(
209
+ # self.config, vocab_size=self.vocab_size, mlm_model_path=config.training.mlm_model_path)
210
+ if self.config.backbone == "vanilla_esm_pretrain":
211
+ self.backbone = WrapVanillaESM(bert_model_path=self.config.training.esm_model_path)
212
+ self.backbone.unfreeze_all_layers()
213
+ self.backbone = torch.compile(self.backbone)
214
+ elif self.config.backbone == 'membrane_esm_finetune':
215
+ self.backbone = WrapMembraneESM(bert_model_path=self.config.checkpointing.pretrained_esm_mdlm_automodel_path)
216
+ self.backbone.unfreeze_all_layers()
217
+ # self.backbone = torch.compile(self.backbone)
218
+
219
+ # elif self.config.backbone == 'dimamba':
220
+ # self.backbone = dimamba.DiMamba(
221
+ # self.config,
222
+ # vocab_size=self.vocab_size,
223
+ # pad_token_id=self.tokenizer.pad_token_id)
224
+ # elif self.config.backbone == 'ar':
225
+ # self.backbone = autoregressive.AR(
226
+ # self.config,
227
+ # vocab_size=self.vocab_size,
228
+ # mask_index=self.mask_index)
229
+ # elif self.config.backbone == 'hf_dit':
230
+ # self.backbone = transformers.AutoModelForMaskedLM.from_pretrained(
231
+ # config.eval.checkpoint_path, trust_remote_code=True)
232
+ # else:
233
+ # raise ValueError(
234
+ # f'Unknown backbone: {self.config.backbone}')
235
+
236
+ self.T = self.config.T
237
+ self.subs_masking = self.config.subs_masking
238
+
239
+ self.softplus = torch.nn.Softplus()
240
+ # metrics are automatically reset at end of epoch
241
+ metrics = torchmetrics.MetricCollection({
242
+ 'nll': NLL(),
243
+ 'bpd': BPD(),
244
+ 'ppl': Perplexity(),
245
+ })
246
+ metrics.set_dtype(torch.float64)
247
+ self.train_metrics = metrics.clone(prefix='train/')
248
+ self.valid_metrics = metrics.clone(prefix='val/')
249
+ self.test_metrics = metrics.clone(prefix='test/')
250
+
251
+ # generative perplexity
252
+ self.gen_ppl_metric = Perplexity()
253
+ self.eval_model_tokenizer = transformers.AutoTokenizer.\
254
+ from_pretrained(self.gen_ppl_eval_model_name_or_path)
255
+ if self.eval_model_tokenizer.pad_token is None:
256
+ self.eval_model_tokenizer.pad_token =\
257
+ self.eval_model_tokenizer.eos_token
258
+ self.eval_model_tokenizer.pad_token_id =\
259
+ self.eval_model_tokenizer.eos_token_id
260
+
261
+ self.noise = noise_schedule.get_noise(self.config,
262
+ dtype=self.dtype)
263
+ if self.config.training.ema > 0:
264
+ self.ema = ema.ExponentialMovingAverage(
265
+ itertools.chain(self.backbone.parameters(),
266
+ self.noise.parameters()),
267
+ decay=self.config.training.ema)
268
+ else:
269
+ self.ema = None
270
+
271
+ self.lr = self.config.optim.lr
272
+ self.sampling_eps = self.config.training.sampling_eps
273
+ self.time_conditioning = self.config.time_conditioning
274
+ self.neg_infinity = -1000000.0
275
+ self.fast_forward_epochs = None
276
+ self.fast_forward_batches = None
277
+ self._validate_configuration()
278
+
279
+ def _validate_configuration(self):
280
+ assert not (self.change_of_variables
281
+ and self.importance_sampling)
282
+ if self.parameterization == 'sedd':
283
+ assert not self.importance_sampling
284
+ assert not self.change_of_variables
285
+ if self.parameterization == 'd3pm':
286
+ assert self.T > 0
287
+ if self.T > 0:
288
+ assert self.parameterization in {'d3pm', 'subs'}
289
+ if self.subs_masking:
290
+ assert self.parameterization == 'd3pm'
291
+
292
+ def on_load_checkpoint(self, checkpoint):
293
+ if self.ema:
294
+ self.ema.load_state_dict(checkpoint['ema'])
295
+ # Copied from:
296
+ # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
297
+ self.fast_forward_epochs = checkpoint['loops'][
298
+ 'fit_loop']['epoch_progress']['current']['completed']
299
+ self.fast_forward_batches = checkpoint['loops'][
300
+ 'fit_loop']['epoch_loop.batch_progress'][
301
+ 'current']['completed']
302
+
303
+ def on_save_checkpoint(self, checkpoint):
304
+ if self.ema:
305
+ checkpoint['ema'] = self.ema.state_dict()
306
+ # Copied from:
307
+ # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py
308
+ # ['epoch_loop.batch_progress']['total']['completed'] is 1 iteration
309
+ # behind, so we're using the optimizer's progress.
310
+ checkpoint['loops']['fit_loop'][
311
+ 'epoch_loop.batch_progress']['total'][
312
+ 'completed'] = checkpoint['loops']['fit_loop'][
313
+ 'epoch_loop.automatic_optimization.optim_progress'][
314
+ 'optimizer']['step']['total'][
315
+ 'completed'] * self.trainer.accumulate_grad_batches
316
+ checkpoint['loops']['fit_loop'][
317
+ 'epoch_loop.batch_progress']['current'][
318
+ 'completed'] = checkpoint['loops']['fit_loop'][
319
+ 'epoch_loop.automatic_optimization.optim_progress'][
320
+ 'optimizer']['step']['current'][
321
+ 'completed'] * self.trainer.accumulate_grad_batches
322
+ # _batches_that_stepped tracks the number of global steps, not the number
323
+ # of local steps, so we don't multiply with self.trainer.accumulate_grad_batches here.
324
+ checkpoint['loops']['fit_loop'][
325
+ 'epoch_loop.state_dict'][
326
+ '_batches_that_stepped'] = checkpoint['loops']['fit_loop'][
327
+ 'epoch_loop.automatic_optimization.optim_progress'][
328
+ 'optimizer']['step']['total']['completed']
329
+ if 'sampler' not in checkpoint.keys():
330
+ checkpoint['sampler'] = {}
331
+ if hasattr(self.trainer.train_dataloader.sampler,
332
+ 'state_dict'):
333
+ sampler_state_dict = self.trainer.\
334
+ train_dataloader.sampler.state_dict()
335
+ checkpoint['sampler'][
336
+ 'random_state'] = sampler_state_dict.get(
337
+ 'random_state', None)
338
+ else:
339
+ checkpoint['sampler']['random_state'] = None
340
+
341
+ self.backbone.save_model(self.config.checkpointing.fine_tuned_esm_mdlm_ckpt_path)
342
+
343
+ def on_train_start(self):
344
+ torch.cuda.empty_cache()
345
+ if self.ema:
346
+ self.ema.move_shadow_params_to_device(self.device)
347
+
348
+ # Adapted from:
349
+ # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
350
+ distributed = (
351
+ self.trainer._accelerator_connector.use_distributed_sampler
352
+ and self.trainer._accelerator_connector.is_distributed)
353
+ if distributed:
354
+ sampler_cls = dataloader.FaultTolerantDistributedSampler
355
+ else:
356
+ sampler_cls = dataloader.RandomFaultTolerantSampler
357
+ updated_dls = []
358
+ for dl in self.trainer.fit_loop._combined_loader.flattened:
359
+ if hasattr(dl.sampler, 'shuffle'):
360
+ dl_sampler = sampler_cls(
361
+ dl.dataset, shuffle=dl.sampler.shuffle)
362
+ else:
363
+ dl_sampler = sampler_cls(dl.dataset)
364
+ if (distributed
365
+ and self.fast_forward_epochs is not None
366
+ and self.fast_forward_batches is not None):
367
+ dl_sampler.load_state_dict({
368
+ 'epoch': self.fast_forward_epochs,
369
+ 'counter': (self.fast_forward_batches
370
+ * self.config.loader.batch_size)})
371
+
372
+ from functools import partial
373
+ from pl_data_loader import collate_fn
374
+ collate_partial = partial(collate_fn, tokenizer=self.tokenizer)
375
+ torch.cuda.empty_cache()
376
+
377
+ updated_dls.append(
378
+ torch.utils.data.DataLoader(
379
+ dl.dataset,
380
+ batch_size=self.config.loader.batch_size,
381
+ num_workers=self.config.loader.num_workers,
382
+ pin_memory=self.config.loader.pin_memory,
383
+ sampler=dl_sampler,
384
+ shuffle=False,
385
+ persistent_workers=False,
386
+ collate_fn=collate_partial))
387
+ self.trainer.fit_loop._combined_loader.flattened = updated_dls
388
+
389
+ def optimizer_step(self, *args, **kwargs):
390
+ super().optimizer_step(*args, **kwargs)
391
+
392
+ gc.collect()
393
+ torch.cuda.empty_cache()
394
+
395
+ if self.ema:
396
+ self.ema.update(itertools.chain(
397
+ self.backbone.parameters(),
398
+ self.noise.parameters()))
399
+
400
+ # optimizer_closure = kwargs.get('optimizer_closure', None)
401
+
402
+ # params_with_grad = [p for p in itertools.chain(
403
+ # self.backbone.parameters(),
404
+ # self.noise.parameters()
405
+ # ) if p.requires_grad and p.grad_fn is not None]
406
+
407
+ # # if params_with_grad:
408
+ # # super().optimizer_step(closure=optimizer_closure)
409
+
410
+ # if self.ema:
411
+ # self.ema.update(params_with_grad)
412
+
413
+ # super().optimizer_step(*args, **kwargs)
414
+
415
+ def _subs_parameterization(self, logits, xt):
416
+ # log prob at the mask index = - infinity
417
+ logits = logits.logits
418
+ logits[:, :, self.mask_index] += self.neg_infinity
419
+ # logits[:, :, self.tokenizer.eos_token_id] += self.neg_infinity
420
+ # logits[:, :, self.tokenizer.cls_token_id] += self.neg_infinity
421
+
422
+ # Normalize the logits such that x.exp() is
423
+ # a probability distribution over vocab_size.
424
+ logits = logits - torch.logsumexp(logits, dim=-1,
425
+ keepdim=True)
426
+
427
+ # Apply updates directly in the logits matrix.
428
+ # For the logits of the unmasked tokens, set all values
429
+ # to -infinity except for the indices corresponding to
430
+ # the unmasked tokens.
431
+ unmasked_indices = (xt != self.mask_index)
432
+ logits[unmasked_indices] = self.neg_infinity
433
+ logits[unmasked_indices, xt[unmasked_indices]] = 0
434
+ return logits
435
+
436
+ def _d3pm_parameterization(self, logits):
437
+ if self.subs_masking:
438
+ logits[:, :, self.mask_index] += self.neg_infinity
439
+ logits = logits - torch.logsumexp(logits, dim=-1,
440
+ keepdim=True)
441
+ return logits
442
+
443
+ def _sedd_parameterization(self, logits, xt, sigma):
444
+ esigm1_log = torch.where(
445
+ sigma < 0.5,
446
+ torch.expm1(sigma),
447
+ sigma.exp() - 1).log().to(logits.dtype)
448
+ # logits shape
449
+ # (batch_size, diffusion_model_input_length, vocab_size)
450
+ logits = logits - esigm1_log[:, None, None] - np.log(
451
+ logits.shape[-1] - 1)
452
+ # The below scatter operation sets the log score
453
+ # for the input word to 0.
454
+ logits = torch.scatter(logits, -1, xt[..., None],
455
+ torch.zeros_like(logits[..., :1]))
456
+ return logits
457
+
458
+ def _process_sigma(self, sigma):
459
+ if sigma is None:
460
+ assert self.parameterization == 'ar'
461
+ return sigma
462
+ if sigma.ndim > 1:
463
+ sigma = sigma.squeeze(-1)
464
+ if not self.time_conditioning:
465
+ sigma = torch.zeros_like(sigma)
466
+ assert sigma.ndim == 1, sigma.shape
467
+ return sigma
468
+
469
+ def forward(self, x, sigma, attention_mask, print_logits=False):
470
+ """Returns log score."""
471
+ sigma = self._process_sigma(sigma)
472
+ with torch.amp.autocast("cuda", dtype=torch.float32):
473
+ logits = self.backbone(x, attention_mask)
474
+ # if print_logits:
475
+ # torch.set_printoptions(profile="full")
476
+ # print(logits)
477
+ # torch.set_printoptions(profile="default")
478
+ if self.parameterization == 'subs':
479
+ return self._subs_parameterization(logits=logits, xt=x)
480
+ return logits
481
+
482
+ def _d3pm_loss(self, model_output, xt, x0, t, attention_mask):
483
+ dt = 1 / self.T
484
+
485
+ if torch.is_tensor(t):
486
+ t = t[:, None]
487
+ assert t.ndim == 2
488
+ t = t.clamp(0., 1. - 1e-4)
489
+ alpha_t = 1 - t + torch.zeros_like(xt)
490
+ alpha_s = 1 - (t - dt) + torch.zeros_like(xt)
491
+
492
+ log_x_theta_at_x0 = torch.gather(
493
+ model_output, -1, x0[:, :, None]).squeeze(-1)
494
+ log_x_theta_at_m = model_output[:, :, self.mask_index]
495
+ x_theta_at_m = log_x_theta_at_m.exp()
496
+
497
+ term_1_coef = dt / t
498
+ term_1_log_nr = torch.log(alpha_t * x_theta_at_m / t + 1)
499
+ term_1_log_dr = log_x_theta_at_x0
500
+
501
+ term_2_coef = 1 - dt / t
502
+ term_2_log_nr = term_1_log_nr
503
+ term_2_log_dr = torch.log(alpha_s * x_theta_at_m / (t - dt) + 1)
504
+
505
+ L_vb_masked = (
506
+ term_1_coef * (term_1_log_nr - term_1_log_dr)
507
+ + term_2_coef * (term_2_log_nr - term_2_log_dr))
508
+
509
+ L_vb = L_vb_masked * (xt == self.mask_index)
510
+
511
+ return self.T * L_vb
512
+
513
+ def _compute_loss(self, batch, prefix):
514
+ if 'attention_mask' in batch:
515
+ attention_mask = batch['attention_mask']
516
+ else:
517
+ attention_mask = None
518
+ if 'mask' in batch: mask = batch['mask']
519
+ else: mask = None
520
+
521
+ losses = self._loss(batch['input_ids'], attention_mask, mask)
522
+ loss = losses.loss
523
+
524
+ if prefix == 'train':
525
+ self.train_metrics.update(losses.nlls, losses.token_mask)
526
+ metrics = self.train_metrics
527
+ elif prefix == 'val':
528
+ self.valid_metrics.update(losses.nlls, losses.token_mask)
529
+ metrics = self.valid_metrics
530
+ elif prefix == 'test':
531
+ self.test_metrics.update(losses.nlls, losses.token_mask)
532
+ metrics = self.test_metrics
533
+ else:
534
+ raise ValueError(f'Invalid prefix: {prefix}')
535
+
536
+ self.log_dict(metrics,
537
+ on_step=False,
538
+ on_epoch=True,
539
+ sync_dist=True)
540
+ return loss
541
+
542
+ def on_train_epoch_start(self):
543
+ self.backbone.train()
544
+ self.noise.train()
545
+
546
+ def training_step(self, batch, batch_idx):
547
+ # Initialize throughput calculation
548
+ start_time = time.time()
549
+
550
+ loss = self._compute_loss(batch, prefix='train')
551
+ self.log(name='trainer/loss',
552
+ value=loss.item(),
553
+ on_step=True,
554
+ on_epoch=False,
555
+ sync_dist=True)
556
+
557
+ # Calculate throughput
558
+ elapsed_time = time.time() - start_time
559
+ total_tokens = batch['input_ids'].numel()
560
+ throughput = total_tokens / elapsed_time
561
+
562
+ self.log(name='trainer/throughput',
563
+ value=throughput,
564
+ on_step=True,
565
+ on_epoch=False,
566
+ sync_dist=True)
567
+
568
+ return loss
569
+
570
+ def on_validation_epoch_start(self):
571
+ # params_with_grad = [p for p in itertools.chain(
572
+ # self.backbone.parameters(),
573
+ # self.noise.parameters()
574
+ # ) if p.requires_grad]
575
+ # if self.ema:
576
+ # self.ema.store(params_with_grad)
577
+ # self.ema.copy_to(params_with_grad)
578
+
579
+ gc.collect()
580
+ torch.cuda.empty_cache()
581
+ if self.ema:
582
+ self.ema.store(
583
+ itertools.chain(
584
+ self.backbone.parameters(),
585
+ self.noise.parameters()))
586
+ self.ema.copy_to(itertools.chain(
587
+ self.backbone.parameters(),
588
+ self.noise.parameters()))
589
+ self.backbone.eval()
590
+ self.noise.eval()
591
+ assert self.valid_metrics.nll.mean_value == 0
592
+ assert self.valid_metrics.nll.weight == 0
593
+
594
+
595
+ def validation_step(self, batch, batch_idx):
596
+ loss = self._compute_loss(batch, prefix='val')
597
+ self.log(name='trainer/val_loss',
598
+ value=loss.item(),
599
+ on_step=True,
600
+ on_epoch=False,
601
+ prog_bar=True,
602
+ sync_dist=True)
603
+ return loss
604
+
605
+ def on_validation_epoch_end(self):
606
+ # params_with_grad = [p for p in itertools.chain(
607
+ # self.backbone.parameters(),
608
+ # self.noise.parameters()
609
+ # ) if p.requires_grad]
610
+ # if ((self.config.eval.compute_perplexity_on_sanity
611
+ # or not self.trainer.sanity_checking)
612
+ # and self.config.eval.generate_samples
613
+ # and not self.parameterization == 'ar'):
614
+ # # (justin): implement sampling and kv cache for AR
615
+ # samples, text_samples = None, None
616
+ # for _ in range(
617
+ # self.config.sampling.num_sample_batches):
618
+ # samples = self._sample()
619
+ # # Decode the samples to be re-tokenized by eval model
620
+ # text_samples = self.tokenizer.batch_decode(samples)
621
+ # if self.config.eval.compute_generative_perplexity:
622
+ # self.compute_generative_perplexity(text_samples)
623
+ # if self.trainer.global_rank == 0 and hasattr(
624
+ # self.trainer.logger, 'log_table'):
625
+ # # Log the last generated samples
626
+ # text_samples = text_samples[
627
+ # : self.config.sampling.num_sample_log]
628
+ # self.trainer.logger.log_table(
629
+ # key=f'samples@global_step{self.global_step}',
630
+ # columns=['Generated Samples'],
631
+ # data=[[s] for s in text_samples])
632
+ # if self.config.eval.compute_generative_perplexity:
633
+ # self.log('val/gen_ppl',
634
+ # self.gen_ppl_metric,
635
+ # on_epoch=True,
636
+ # on_step=False,
637
+ # sync_dist=True)
638
+
639
+ gc.collect()
640
+ torch.cuda.empty_cache()
641
+ if self.ema:
642
+ self.ema.restore(
643
+ itertools.chain(
644
+ self.backbone.parameters(),
645
+ self.noise.parameters()))
646
+
647
+ def test_step(self, batch, batch_idx):
648
+ loss = self._compute_loss(batch, prefix='test')
649
+ self.log('test/loss',
650
+ value=loss.item(),
651
+ on_step=False,
652
+ on_epoch=True,
653
+ sync_dist=True)
654
+
655
+ if self.config.eval.compute_generative_perplexity:
656
+ samples, text_samples = None, None
657
+ for _ in range(
658
+ self.config.sampling.num_sample_batches):
659
+ samples = self._sample()
660
+ # Decode the samples to be re-tokenized by eval model
661
+ text_samples = self.tokenizer.batch_decode(samples)
662
+ if self.config.eval.compute_generative_perplexity:
663
+ self.compute_generative_perplexity(text_samples)
664
+ if self.trainer.global_rank == 0 and hasattr(
665
+ self.trainer.logger, 'log_table'):
666
+ # Log the last generated samples
667
+ text_samples = text_samples[
668
+ : self.config.sampling.num_sample_log]
669
+ self.trainer.logger.log_table(
670
+ key=f'samples@global_step{self.global_step}',
671
+ columns=['Generated Samples'],
672
+ data=[[s] for s in text_samples])
673
+ if self.config.eval.compute_generative_perplexity:
674
+ self.log('test/gen_ppl',
675
+ self.gen_ppl_metric,
676
+ on_epoch=False,
677
+ on_step=True,
678
+ sync_dist=True)
679
+
680
+ def on_test_epoch_start(self):
681
+ # params_with_grad = [p for p in itertools.chain(
682
+ # self.backbone.parameters(),
683
+ # self.noise.parameters()
684
+ # ) if p.requires_grad]
685
+
686
+ if self.ema:
687
+ self.ema.store(itertools.chain(
688
+ self.backbone.parameters(),
689
+ self.noise.parameters()))
690
+ self.ema.copy_to(itertools.chain(
691
+ self.backbone.parameters(),
692
+ self.noise.parameters()))
693
+
694
+ self.backbone.eval()
695
+ self.noise.eval()
696
+ self.test_metrics.reset()
697
+
698
+ def on_test_epoch_end(self):
699
+ # params_with_grad = [p for p in itertools.chain(
700
+ # self.backbone.parameters(),
701
+ # self.noise.parameters()
702
+ # ) if p.requires_grad]
703
+
704
+ if self.ema:
705
+ self.ema.restore(itertools.chain(
706
+ self.backbone.parameters(),
707
+ self.noise.parameters()))
708
+
709
+ for metric_name, metric_value in self.test_metrics.compute().items():
710
+ self.log(metric_name, metric_value, sync_dist=True)
711
+
712
+ def configure_optimizers(self):
713
+ # (yair): Lightning currently giving this warning when using `fp16`:
714
+ # "Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
715
+ # Not clear if this is a problem or not.
716
+ # See: https://github.com/Lightning-AI/pytorch-lightning/issues/5558
717
+
718
+ # params_with_grad = [p for p in itertools.chain(
719
+ # self.backbone.parameters(),
720
+ # self.noise.parameters()
721
+ # ) if p.requires_grad]
722
+
723
+ optimizer = torch.optim.AdamW(
724
+ itertools.chain(self.backbone.parameters(),
725
+ self.noise.parameters()),
726
+ lr=self.config.optim.lr,
727
+ betas=(self.config.optim.beta1,
728
+ self.config.optim.beta2),
729
+ eps=self.config.optim.eps,
730
+ weight_decay=self.config.optim.weight_decay
731
+ )
732
+
733
+ # scheduler = hydra.utils.instantiate(
734
+ # self.config.lr_scheduler, optimizer=optimizer)
735
+ # scheduler_dict = {
736
+ # 'scheduler': scheduler,
737
+ # 'interval': 'step',
738
+ # 'monitor': 'val/loss',
739
+ # 'name': 'trainer/lr',
740
+ # }
741
+
742
+ self.total_steps = self.config.trainer.max_steps
743
+ scheduler = CosineWarmup(optimizer,
744
+ warmup_steps=self.config.lr_scheduler.num_warmup_steps,
745
+ total_steps=self.total_steps)
746
+
747
+ scheduler_dict = {
748
+ 'scheduler': scheduler,
749
+ 'interval': 'step',
750
+ 'frequency': 1,
751
+ 'monitor': 'val/loss',
752
+ 'name': 'trainer/lr'
753
+ }
754
+
755
+ return [optimizer], [scheduler_dict]
756
+
757
+ @torch.no_grad()
758
+ def eval_retokenize(self, text_samples, max_length):
759
+ """Retokenizes samples for the eval model.
760
+
761
+ Args:
762
+ text_samples: List of sentences generated by the model.
763
+ Returns:
764
+ samples: Samples re-tokenized for the eval model
765
+ attn_mask: Attention mask for the eval model
766
+ eval_context_size: Size of the context for the eval model
767
+ """
768
+ if 'llama2' in self.gen_ppl_eval_model_name_or_path:
769
+ tokenizer_kwargs = {
770
+ 'text_samples': text_samples,
771
+ 'return_tensors': 'pt',
772
+ 'return_token_type_ids': False,
773
+ 'return_attention_mask': True,
774
+ 'truncation': True,
775
+ 'padding': True,
776
+ 'max_length': max_length,
777
+ }
778
+ eval_context_size = 4096
779
+ else:
780
+ tokenizer_kwargs = {
781
+ 'return_tensors': 'pt',
782
+ 'return_token_type_ids': False,
783
+ 'return_attention_mask': True,
784
+ 'truncation': True,
785
+ 'padding': True,
786
+ 'max_length': max_length,
787
+ }
788
+ eval_context_size = 1024
789
+ samples = self.eval_model_tokenizer(
790
+ text_samples, ** tokenizer_kwargs)
791
+ attn_mask = samples['attention_mask']
792
+ samples = samples['input_ids']
793
+ if 'llama2' not in self.gen_ppl_eval_model_name_or_path:
794
+ attn_mask = attn_mask.to(self.device)
795
+ samples = samples.to(self.device)
796
+ return samples, attn_mask, eval_context_size
797
+
798
+ # @torch.no_grad()
799
+ # def compute_generative_perplexity(
800
+ # self,
801
+ # text_samples: typing.List[str],
802
+ # retokenize: bool = True,
803
+ # max_length: typing.Optional[int] = None) -> None:
804
+ # """Compute the generative perplexity of the model.
805
+
806
+ # Args:
807
+ # text_samples: List of sentences generated by the model.
808
+
809
+ # Returns:
810
+ # Perplexity of the generated text under a different
811
+ # pre-trained AR model (e.g., GPT2).
812
+ # """
813
+ # os.environ['TOKENIZERS_PARALLELISM'] = 'false'
814
+ # eval_model = transformers.AutoModelForCausalLM.from_pretrained(
815
+ # self.gen_ppl_eval_model_name_or_path).eval()
816
+ # if max_length is None:
817
+ # max_length = self.config.model.length
818
+ # if 'llama2' not in self.gen_ppl_eval_model_name_or_path:
819
+ # eval_model = eval_model.to(self.device)
820
+ # # Re-tokenize using eval model's tokenizer
821
+ # if retokenize:
822
+ # (samples, attn_mask,
823
+ # eval_context_size) = self.eval_retokenize(
824
+ # text_samples, max_length=max_length)
825
+ # else:
826
+ # samples = text_samples
827
+ # attn_mask = torch.ones(samples.shape).to(self.device)
828
+ # eval_context_size = samples.shape[-1]
829
+ # batch_size = min(
830
+ # self.config.eval.perplexity_batch_size,
831
+ # samples.shape[0])
832
+ # num_batches = samples.shape[0] // batch_size
833
+ # for i in range(num_batches):
834
+ # _samples = torch.split(
835
+ # samples[i * batch_size: (i + 1) * batch_size],
836
+ # eval_context_size,
837
+ # dim=-1)
838
+ # _attn_mask = torch.split(
839
+ # attn_mask[i * batch_size: (i + 1) * batch_size],
840
+ # eval_context_size,
841
+ # dim=-1)
842
+ # for (sample_chunk, attn_mask_chunk) in zip(
843
+ # _samples, _attn_mask):
844
+ # logits = eval_model(
845
+ # sample_chunk, attention_mask=attn_mask_chunk)[0]
846
+ # logits = logits.transpose(-1, -2)
847
+
848
+ # nlls = F.cross_entropy(logits[..., :-1],
849
+ # sample_chunk[..., 1:],
850
+ # reduction='none')
851
+ # first_eos = (sample_chunk == self.eval_model_tokenizer\
852
+ # .eos_token_id).cumsum(-1) == 1
853
+ # token_mask = (
854
+ # sample_chunk
855
+ # != self.eval_model_tokenizer.eos_token_id)
856
+ # self.gen_ppl_metric.update(
857
+ # nlls, first_eos[..., 1:] + token_mask[..., 1:])
858
+
859
+
860
+ @torch.no_grad()
861
+ def compute_masked_perplexity(self, sequences, masked):
862
+ """Compute the pseudo-perplexity of the generated protein sequences."""
863
+ total_nll = 0
864
+ total_tokens = 0
865
+
866
+ for sequence in sequences:
867
+ # Tokenize the sequence
868
+ input_ids = self.tokenizer(masked, return_tensors="pt").input_ids.to(self.device)
869
+ gt_ids = self.tokenizer(sequence.upper(), return_tensors="pt").input_ids.to(self.device)
870
+
871
+ # print(input_ids.shape)
872
+ # print(gt_ids.shape)
873
+
874
+ # Forward pass through the ESM model
875
+ attention_mask = torch.ones_like(input_ids)
876
+ if self.config.mode in ['train', 'ppl_eval']:
877
+ outputs = self.backbone.model.forward(input_ids=input_ids, attention_mask=attention_mask)
878
+ elif self.config.mode == "sample_eval":
879
+ outputs = self.backbone.model.forward(input_ids)
880
+ logits = outputs[-1] # B, L, V
881
+
882
+ # Compute loss
883
+ # shift_logits = logits[:, :-1, :].contiguous() # remove eos
884
+ # shift_labels = input_ids[:, 1:].contiguous()
885
+ # print(masked)
886
+ # print(gt_ids.where(input_ids==32, torch.full_like(input_ids, -100)).view(-1))
887
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
888
+ gt_ids.where(input_ids==32, torch.full_like(input_ids, -100)).view(-1),
889
+ reduction='sum')
890
+
891
+ total_nll += loss.item()
892
+ #total_tokens += (input_ids != self.tokenizer.pad_token_id).sum().item() - 1 # -1 for the first token
893
+ total_tokens += input_ids.ne(self.tokenizer.pad_token_id).sum().item() # count in bos and eos
894
+ # Compute pseudo-perplexity
895
+ # print(total_nll, ",;,", total_tokens)
896
+ pseudo_perplexity = torch.exp(torch.tensor(total_nll / total_tokens))
897
+ self.gen_ppl_metric.update(pseudo_perplexity)
898
+
899
+ return pseudo_perplexity.item()
900
+
901
+ @torch.no_grad()
902
+ def compute_generative_perplexity(
903
+ self,
904
+ text_samples: typing.List[str],
905
+ retokenize: bool = True,
906
+ max_length: typing.Optional[int] = None) -> None:
907
+ """Compute the generative perplexity of the model.
908
+
909
+ Args:
910
+ text_samples: List of sentences generated by the model.
911
+
912
+ Returns:
913
+ Perplexity of the generated text under a different
914
+ pre-trained AR model (e.g., GPT2).
915
+ """
916
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
917
+ eval_model = transformers.AutoModelForCausalLM.from_pretrained(
918
+ self.gen_ppl_eval_model_name_or_path).eval()
919
+ if max_length is None:
920
+ max_length = self.config.model.length
921
+ if 'llama2' not in self.gen_ppl_eval_model_name_or_path:
922
+ eval_model = eval_model.to(self.device)
923
+ # Re-tokenize using eval model's tokenizer
924
+ if retokenize:
925
+ (samples, attn_mask,
926
+ eval_context_size) = self.eval_retokenize(
927
+ text_samples, max_length=max_length)
928
+ else:
929
+ samples = text_samples
930
+ attn_mask = torch.ones(samples.shape).to(self.device)
931
+ eval_context_size = samples.shape[-1]
932
+ batch_size = min(
933
+ self.config.eval.perplexity_batch_size,
934
+ samples.shape[0])
935
+ num_batches = samples.shape[0] // batch_size
936
+ for i in range(num_batches):
937
+ _samples = torch.split(
938
+ samples[i * batch_size: (i + 1) * batch_size],
939
+ eval_context_size,
940
+ dim=-1)
941
+ _attn_mask = torch.split(
942
+ attn_mask[i * batch_size: (i + 1) * batch_size],
943
+ eval_context_size,
944
+ dim=-1)
945
+ for (sample_chunk, attn_mask_chunk) in zip(
946
+ _samples, _attn_mask):
947
+ logits = eval_model(
948
+ sample_chunk, attention_mask=attn_mask_chunk)[0]
949
+ logits = logits.transpose(-1, -2)
950
+
951
+ nlls = F.cross_entropy(logits[..., :-1],
952
+ sample_chunk[..., 1:],
953
+ reduction='none')
954
+ first_eos = (sample_chunk == self.eval_model_tokenizer\
955
+ .eos_token_id).cumsum(-1) == 1
956
+ token_mask = (
957
+ sample_chunk
958
+ != self.eval_model_tokenizer.eos_token_id)
959
+ self.gen_ppl_metric.update(
960
+ nlls, first_eos[..., 1:] + token_mask[..., 1:])
961
+
962
+ def q_xt(self, x, move_chance):
963
+ """Computes the noisy sample xt.
964
+
965
+ Args:
966
+ x: int torch.Tensor with shape (batch_size,
967
+ diffusion_model_input_length), input.
968
+ move_chance: float torch.Tensor with shape (batch_size, 1).
969
+ """
970
+
971
+ actual_seq_length = (x != 1).sum(dim=1, keepdim=True)
972
+
973
+ max_mask_length = (actual_seq_length * 0.75).long()
974
+
975
+ move_indices = torch.rand(*x.shape, device=x.device) < move_chance
976
+
977
+ restricted_move_indices = torch.zeros_like(move_indices, dtype=torch.bool)
978
+
979
+ for i in range(x.shape[0]):
980
+ true_positions = torch.where(move_indices[i])[0]
981
+ if len(true_positions) > max_mask_length[i]:
982
+ selected_positions = true_positions[:max_mask_length[i].item()]
983
+ restricted_move_indices[i, selected_positions] = True
984
+ else:
985
+ restricted_move_indices[i] = move_indices[i]
986
+ xt = torch.where(restricted_move_indices, self.mask_index, x)
987
+
988
+ return xt
989
+
990
+ def _sample_prior(self, *batch_dims):
991
+ return self.mask_index * torch.ones(* batch_dims, dtype=torch.int64)
992
+
993
+ def _ddpm_caching_update(self, x, t, dt, p_x0=None, attention_mask=None):
994
+ assert self.config.noise.type == 'loglinear'
995
+ sigma_t, _ = self.noise(t)
996
+ if t.ndim > 1:
997
+ t = t.squeeze(-1)
998
+ assert t.ndim == 1
999
+ move_chance_t = t[:, None, None]
1000
+ move_chance_s = (t - dt)[:, None, None]
1001
+ assert move_chance_t.ndim == 3, move_chance_t.shape
1002
+ if p_x0 is None:
1003
+ p_x0 = self.forward(x, sigma_t, attention_mask).exp()
1004
+
1005
+ assert move_chance_t.ndim == p_x0.ndim
1006
+ q_xs = p_x0 * (move_chance_t - move_chance_s)
1007
+ q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
1008
+ _x = _sample_categorical(q_xs)
1009
+
1010
+ copy_flag = (x != self.mask_index).to(x.dtype)
1011
+ return p_x0, copy_flag * x + (1 - copy_flag) * _x
1012
+
1013
+ def _ddpm_update(self, x, t, dt, attention_mask):
1014
+ sigma_t, _ = self.noise(t)
1015
+ sigma_s, _ = self.noise(t - dt)
1016
+ if sigma_t.ndim > 1:
1017
+ sigma_t = sigma_t.squeeze(-1)
1018
+ if sigma_s.ndim > 1:
1019
+ sigma_s = sigma_s.squeeze(-1)
1020
+ assert sigma_t.ndim == 1, sigma_t.shape
1021
+ assert sigma_s.ndim == 1, sigma_s.shape
1022
+ move_chance_t = 1 - torch.exp(-sigma_t)
1023
+ move_chance_s = 1 - torch.exp(-sigma_s)
1024
+ move_chance_t = move_chance_t[:, None, None]
1025
+ move_chance_s = move_chance_s[:, None, None]
1026
+ unet_conditioning = sigma_t
1027
+ log_p_x0 = self.forward(x, unet_conditioning, attention_mask)
1028
+ assert move_chance_t.ndim == log_p_x0.ndim
1029
+ # Technically, this isn't q_xs since there's a division
1030
+ # term that is missing. This division term doesn't affect
1031
+ # the samples.
1032
+ q_xs = log_p_x0.exp() * (move_chance_t
1033
+ - move_chance_s)
1034
+ q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
1035
+ _x = _sample_categorical(q_xs)
1036
+
1037
+ copy_flag = (x != self.mask_index).to(x.dtype)
1038
+ return copy_flag * x + (1 - copy_flag) * _x
1039
+
1040
+ def _ar_sampler(self, bsz):
1041
+ # precompute token buffer
1042
+ num_pred_tokens = self.config.model.length - 1
1043
+ x = torch.zeros(
1044
+ (bsz, num_pred_tokens + 1),
1045
+ dtype=torch.long,
1046
+ device=self.device)
1047
+ x[:, 0] = self.tokenizer.bos_token_id
1048
+ # precompute noise
1049
+ noise = (torch.distributions.Gumbel(0, 1)
1050
+ .sample((bsz, num_pred_tokens, self.vocab_size))
1051
+ .to(self.device))
1052
+ for i in range(num_pred_tokens):
1053
+ next_logits = self.forward(x[:, :i + 1], None)[:, -1]
1054
+ y = (next_logits + noise[:, i]).argmax(-1)
1055
+ x[:, i + 1] = y
1056
+ return x
1057
+
1058
+ @torch.no_grad()
1059
+ def _sample(self, num_steps=None, eps=1e-5, x_input = None):
1060
+ """Generate samples from the model."""
1061
+ batch_size_per_gpu = self.config.eval.perplexity_batch_size
1062
+ if self.parameterization == 'ar':
1063
+ return self._ar_sampler(batch_size_per_gpu)
1064
+ # Lightning auto-casting is not working in this method for some reason
1065
+ if num_steps is None:
1066
+ num_steps = self.config.sampling.steps
1067
+ if x_input is not None:
1068
+ x = x_input.input_ids
1069
+ attention_mask = x_input.attention_mask
1070
+ else:
1071
+ x = self._sample_prior(batch_size_per_gpu, self.config.model.length).to(self.device)
1072
+ attention_mask = torch.ones_like(x)
1073
+ timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
1074
+ dt = (1 - eps) / num_steps
1075
+ p_x0_cache = None
1076
+
1077
+ for i in range(num_steps):
1078
+ t = timesteps[i] * torch.ones(x.shape[0], 1, device=self.device)
1079
+ if self.sampler == 'ddpm':
1080
+ x = self._ddpm_update(x, t, dt)
1081
+ elif self.sampler == 'ddpm_cache':
1082
+ p_x0_cache, x_next = self._ddpm_caching_update(x, t, dt, p_x0=p_x0_cache, attention_mask=attention_mask)
1083
+ if (not torch.allclose(x_next, x) or self.time_conditioning):
1084
+ # Disable caching
1085
+ p_x0_cache = None
1086
+ x = x_next
1087
+ # print(self.tokenizer.decode(x.squeeze()))
1088
+ else:
1089
+ x = self._analytic_update(x, t, dt, attention_mask)
1090
+
1091
+ if self.config.sampling.noise_removal:
1092
+ t = timesteps[-1] * torch.ones(x.shape[0], 1,
1093
+ device=self.device)
1094
+ if self.sampler == 'analytic':
1095
+ x = self._denoiser_update(x, t)
1096
+ else:
1097
+ unet_conditioning = self.noise(t)[0]
1098
+ x = self.forward(x, unet_conditioning, attention_mask, print_logits=True).argmax(dim=-1)
1099
+ # print(self.tokenizer.decode(x.squeeze()))
1100
+ return x
1101
+
1102
+ def restore_model_and_sample(self, num_steps, eps=1e-5):
1103
+ """Generate samples from the model."""
1104
+ # Lightning auto-casting is not working in this method for some reason
1105
+ # params_with_grad = [p for p in itertools.chain(
1106
+ # self.backbone.parameters(),
1107
+ # self.noise.parameters()
1108
+ # ) if p.requires_grad]
1109
+
1110
+ if self.ema:
1111
+ self.ema.store(itertools.chain(self.backbone.parameters(),
1112
+ self.noise.parameters()))
1113
+ self.ema.copy_to(itertools.chain(self.backbone.parameters(),
1114
+ self.noise.parameters()))
1115
+ self.backbone.eval()
1116
+ self.noise.eval()
1117
+ samples = self._sample(num_steps=num_steps, eps=eps)
1118
+ if self.ema:
1119
+ self.ema.restore(itertools.chain(self.backbone.parameters(),
1120
+ self.noise.parameters()))
1121
+ self.backbone.train()
1122
+ self.noise.train()
1123
+ return samples
1124
+
1125
+ def get_score(self, x, sigma, attention_mask=None):
1126
+ model_output = self.forward(x, sigma, attention_mask)
1127
+ if self.parameterization == 'subs':
1128
+ # score(x, t) = p_t(y) / p_t(x)
1129
+ # => log score(x, t) = log p_t(y) - log p_t(x)
1130
+
1131
+ # case 1: x = masked
1132
+ # (i) y = unmasked
1133
+ # log score(x, t) = log p_\theta(x)|_y + log k
1134
+ # where k = exp(- sigma) / (1 - exp(- sigma))
1135
+ # (ii) y = masked
1136
+ # log score(x, t) = 0
1137
+
1138
+ # case 2: x = unmasked
1139
+ # (i) y != masked, y != x
1140
+ # log score(x_i, t) = - inf
1141
+ # (ii) y = x
1142
+ # log score(x_i, t) = 0
1143
+ # (iii) y = masked token
1144
+ # log score(x_i, t) = - log k
1145
+ # where k = exp(- sigma) / (1 - exp(- sigma))
1146
+
1147
+ log_k = - torch.log(torch.expm1(sigma)).squeeze(-1)
1148
+ assert log_k.ndim == 1
1149
+
1150
+ masked_score = model_output + log_k[:, None, None]
1151
+ masked_score[:, :, self.mask_index] = 0
1152
+
1153
+ unmasked_score = self.neg_infinity * torch.ones_like(
1154
+ model_output)
1155
+ unmasked_score = torch.scatter(
1156
+ unmasked_score,
1157
+ -1,
1158
+ x[..., None],
1159
+ torch.zeros_like(unmasked_score[..., :1]))
1160
+ unmasked_score[:, :, self.mask_index] = - (
1161
+ log_k[:, None] * torch.ones_like(x))
1162
+
1163
+ masked_indices = (x == self.mask_index).to(
1164
+ model_output.dtype)[:, :, None]
1165
+ model_output = (
1166
+ masked_score * masked_indices
1167
+ + unmasked_score * (1 - masked_indices))
1168
+ return model_output.exp()
1169
+
1170
+ def _staggered_score(self, score, dsigma):
1171
+ score = score.clone()
1172
+ extra_const = (1 - dsigma.exp()) * score.sum(dim=-1)
1173
+ score *= dsigma.exp()[:, None]
1174
+ score[..., self.mask_index] += extra_const
1175
+ return score
1176
+
1177
+ def _analytic_update(self, x, t, step_size, attention_mask=None):
1178
+ curr_sigma, _ = self.noise(t)
1179
+ next_sigma, _ = self.noise(t - step_size)
1180
+ dsigma = curr_sigma - next_sigma
1181
+ score = self.get_score(x, curr_sigma, attention_mask)
1182
+ stag_score = self._staggered_score(score, dsigma)
1183
+ probs = stag_score * self._transp_transition(x, dsigma)
1184
+ return _sample_categorical(probs)
1185
+
1186
+ def _denoiser_update(self, x, t):
1187
+ sigma, _ = self.noise(t)
1188
+ score = self.get_score(x, sigma)
1189
+ stag_score = self._staggered_score(score, sigma)
1190
+ probs = stag_score * self._transp_transition(x, sigma)
1191
+ probs[..., self.mask_index] = 0
1192
+ samples = _sample_categorical(probs)
1193
+ return samples
1194
+
1195
+ def _transp_transition(self, i, sigma):
1196
+ sigma = _unsqueeze(sigma, reference=i[..., None])
1197
+ edge = torch.exp(-sigma) * F.one_hot(
1198
+ i, num_classes=self.vocab_size)
1199
+ edge += torch.where(i == self.mask_index,
1200
+ 1 - torch.exp(-sigma).squeeze(-1),
1201
+ 0)[..., None]
1202
+ return edge
1203
+
1204
+ def _sample_t(self, n, device):
1205
+ _eps_t = torch.rand(n, device=device)
1206
+ if self.antithetic_sampling:
1207
+ offset = torch.arange(n, device=device) / n
1208
+ _eps_t = (_eps_t / n + offset) % 1
1209
+ t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
1210
+ if self.importance_sampling:
1211
+ return self.noise.importance_sampling_transformation(t)
1212
+ return t
1213
+
1214
+ def _maybe_sub_sample(self, x0, attention_mask):
1215
+ # seqlen = x0.shape[1]
1216
+ # if seqlen > self.config.model.length:
1217
+ # assert seqlen == 2 * self.config.model.length
1218
+ # # cropping is needed for text8-crop dataset
1219
+ # # try the same starting point for now
1220
+ # start = np.random.choice(self.config.model.length)
1221
+ # end = start + self.config.model.length
1222
+ # input_tokens = x0[:, start: end]
1223
+ # output_tokens = x0[:, start + 1: end + 1]
1224
+ # new_attention_mask = attention_mask[:, start: end]
1225
+
1226
+ # # Helps with validation PPL, since the val
1227
+ # # examples will all start and end with BOS/EOS
1228
+ # input_tokens[:, 0] = self.tokenizer.bos_token_id
1229
+ # output_tokens[:, -1] = self.tokenizer.eos_token_id
1230
+ # elif self.parameterization == 'ar':
1231
+ # input_tokens = x0[:, :-1]
1232
+ # output_tokens = x0[:, 1:]
1233
+ # new_attention_mask = attention_mask[:, 1:]
1234
+ # else:
1235
+ input_tokens = x0
1236
+ output_tokens = None
1237
+ new_attention_mask = attention_mask
1238
+ return input_tokens, output_tokens, new_attention_mask
1239
+
1240
+ def _reconstruction_loss(self, x0, attention_mask):
1241
+ t0 = torch.zeros(x0.shape[0], dtype=self.dtype,
1242
+ device=self.device)
1243
+ assert self.config.noise.type == 'loglinear'
1244
+ # The above assert is for d3pm parameterization
1245
+ unet_conditioning = self.noise(t0)[0][:, None]
1246
+ model_output_t0 = self.forward(x0, unet_conditioning, attention_mask)
1247
+ return - torch.gather(input=model_output_t0,
1248
+ dim=-1,
1249
+ index=x0[:, :, None]).squeeze(-1)
1250
+
1251
+ def _forward_pass_diffusion(self, x0, attention_mask, mask=None):
1252
+ t = self._sample_t(x0.shape[0], x0.device)
1253
+ if self.T > 0:
1254
+ t = (t * self.T).to(torch.int)
1255
+ t = t / self.T
1256
+ # t \in {1/T, 2/T, ..., 1}
1257
+ t += (1 / self.T)
1258
+
1259
+ if self.change_of_variables:
1260
+ unet_conditioning = t[:, None]
1261
+ f_T = torch.log1p(- torch.exp(- self.noise.sigma_max))
1262
+ f_0 = torch.log1p(- torch.exp(- self.noise.sigma_min))
1263
+ move_chance = torch.exp(f_0 + t * (f_T - f_0))
1264
+ move_chance = move_chance[:, None]
1265
+ else:
1266
+ sigma, dsigma = self.noise(t)
1267
+ unet_conditioning = sigma[:, None]
1268
+ move_chance = 1 - torch.exp(-sigma[:, None])
1269
+
1270
+ if mask is None: xt = self.q_xt(x0, move_chance)
1271
+ else: xt = x0.where(mask==1, torch.full_like(x0, self.tokenizer.mask_token_id))
1272
+ model_output = self.forward(xt, unet_conditioning, attention_mask)
1273
+ # print(self.tokenizer.decode(torch.argmax(model_output[0], dim=-1)))
1274
+
1275
+ utils.print_nans(model_output, 'model_output')
1276
+
1277
+ if self.parameterization == 'sedd':
1278
+ return dsigma[:, None] * self._score_entropy(
1279
+ model_output, sigma[:, None], xt, x0)
1280
+
1281
+ if self.T > 0:
1282
+ diffusion_loss = self._d3pm_loss(
1283
+ model_output=model_output, xt=xt, x0=x0, t=t)
1284
+ if self.parameterization == 'd3pm':
1285
+ reconstruction_loss = self._reconstruction_loss(x0)
1286
+ elif self.parameterization == 'subs':
1287
+ reconstruction_loss = 0
1288
+ return reconstruction_loss + diffusion_loss
1289
+
1290
+ # SUBS parameterization, continuous time.
1291
+ log_p_theta = torch.gather(
1292
+ input=model_output,
1293
+ dim=-1,
1294
+ index=x0[:, :, None]).squeeze(-1)
1295
+
1296
+ if self.change_of_variables or self.importance_sampling:
1297
+ return log_p_theta * torch.log1p(
1298
+ - torch.exp(- self.noise.sigma_min))
1299
+
1300
+ return - log_p_theta * (
1301
+ dsigma / torch.expm1(sigma))[:, None]
1302
+
1303
+ def _loss(self, x0, attention_mask, mask=None):
1304
+ (input_tokens, output_tokens,
1305
+ attention_mask) = self._maybe_sub_sample(
1306
+ x0, attention_mask)
1307
+
1308
+ if self.parameterization == 'ar':
1309
+ logprobs = self.backbone(input_tokens, None, attention_mask)
1310
+ loss = - logprobs.gather(
1311
+ -1, output_tokens[:, :, None])[:, :, 0]
1312
+ else:
1313
+ loss = self._forward_pass_diffusion(input_tokens, attention_mask, mask)
1314
+
1315
+ nlls = loss * attention_mask
1316
+ count = attention_mask.sum()
1317
+
1318
+ batch_nll = nlls.sum()
1319
+ token_nll = batch_nll / count
1320
+
1321
+ return Loss(loss=token_nll,
1322
+ nlls=nlls,
1323
+ token_mask=attention_mask)
1324
+
1325
+ def _score_entropy(self, log_score, sigma, xt, x0):
1326
+ """Computes the SEDD loss.
1327
+
1328
+ Args:
1329
+ log_score: float torch.Tensor with shape (batch_size,
1330
+ diffusion_model_input_length, vocab_size),
1331
+ log score, output of the denoising network.
1332
+ xt: int torch.Tensor with shape (batch_size,
1333
+ diffusion_model_input_length), input.
1334
+ x0: int torch.Tensor with shape (batch_size,
1335
+ diffusion_model_input_length), input.
1336
+ sigma: float torch.Tensor with shape (batch_size, 1).
1337
+
1338
+ Returns:
1339
+ loss with shape (batch_size, diffusion_model_input_length)
1340
+ """
1341
+ masked_indices = xt == self.mask_index
1342
+
1343
+ expsig_minus_1 = torch.expm1(sigma).expand_as(xt)
1344
+ q_ratio = 1 / expsig_minus_1[masked_indices]
1345
+
1346
+ words_that_were_masked = x0[masked_indices]
1347
+
1348
+ neg_term = q_ratio * torch.gather(
1349
+ log_score[masked_indices],
1350
+ -1,
1351
+ words_that_were_masked[..., None]).squeeze(-1)
1352
+ score = log_score[masked_indices].exp()
1353
+ if self.mask_index == self.vocab_size - 1:
1354
+ pos_term = score[:, :-1].sum(dim=-1)
1355
+ else:
1356
+ pos_term = score[:, : self.mask_index].sum(
1357
+ dim=-1) + score[:, self.mask_index + 1:].sum(dim=-1)
1358
+ const = q_ratio * (q_ratio.log() - 1)
1359
+
1360
+ entropy = torch.zeros(* xt.shape, device=xt.device)
1361
+ entropy[masked_indices] += pos_term - neg_term + const
1362
+ return entropy
1363
+
1364
+ @torch.no_grad
1365
+ def sample_subs_guidance(
1366
+ self, n_samples, stride_length, num_strides, dt=0.001):
1367
+ ones = torch.ones(n_samples, dtype=self.dtype,
1368
+ device=self.device)
1369
+
1370
+ num_steps = int(1 / dt)
1371
+ sampling_steps = 0
1372
+ intermediate_tokens = []
1373
+ target = None
1374
+ for _ in range(num_strides + 1):
1375
+ p_x0_cache = None
1376
+ x = self._sample_prior(
1377
+ n_samples,
1378
+ self.config.model.length).to(self.device)
1379
+ if target is not None:
1380
+ x[:, : -stride_length] = target
1381
+ for i in range(num_steps + 1):
1382
+ p_x0_cache, x_next = self._ddpm_caching_update(
1383
+ x=x, t=(1 - i * dt) * ones, dt=dt, p_x0=p_x0_cache)
1384
+ if (not torch.allclose(x_next, x)
1385
+ or self.time_conditioning):
1386
+ p_x0_cache = None
1387
+ sampling_steps += 1
1388
+ x = x_next
1389
+ x = self.forward(x, 0 * ones).argmax(dim=-1)
1390
+ intermediate_tokens.append(
1391
+ x[:, :stride_length].cpu().numpy())
1392
+ target = x[:, stride_length:]
1393
+
1394
+ intermediate_tokens.append(target.cpu().numpy())
1395
+ intermediate_text_samples = []
1396
+ sequence_lengths = ((
1397
+ np.concatenate(intermediate_tokens, axis=1)[:, 1:]
1398
+ == self.tokenizer.eos_token_id).cumsum(-1) == 0).sum(-1)
1399
+ for i in range(2, len(intermediate_tokens) + 1):
1400
+ intermediate_text_samples.append(
1401
+ self.tokenizer.batch_decode(
1402
+ np.concatenate(intermediate_tokens[:i], axis=1)))
1403
+ return (sampling_steps, intermediate_text_samples,
1404
+ sequence_lengths)
1405
+
1406
+ def restore_model_and_semi_ar_sample(
1407
+ self, stride_length, num_strides, dt=0.001):
1408
+ """Generate samples from the model."""
1409
+ # Lightning auto-casting is not working in this method for some reason
1410
+
1411
+ # params_with_grad = [p for p in itertools.chain(
1412
+ # self.backbone.parameters(),
1413
+ # self.noise.parameters()
1414
+ # ) if p]
1415
+
1416
+ if self.ema:
1417
+ self.ema.store(itertools.chain(self.backbone.parameters(),
1418
+ self.noise.parameters()))
1419
+ self.ema.copy_to(itertools.chain(self.backbone.parameters(),
1420
+ self.noise.parameters()))
1421
+ self.backbone.eval()
1422
+ self.noise.eval()
1423
+ (sampling_steps, samples,
1424
+ sequence_lengths) = self.sample_subs_guidance(
1425
+ n_samples=self.config.loader.eval_batch_size,
1426
+ stride_length=stride_length,
1427
+ num_strides=num_strides,
1428
+ dt=dt)
1429
+ if self.ema:
1430
+ self.ema.restore(itertools.chain(self.backbone.parameters(),
1431
+ self.noise.parameters()))
1432
+ self.backbone.train()
1433
+ self.noise.train()
1434
+ return sampling_steps, samples, sequence_lengths
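A minimal sketch (not part of diffusion.py itself) of how the semi-autoregressive sampler above is invoked, mirroring the `generate_samples` path in main.py further down in this upload; `config` is assumed to be the composed Hydra config used throughout.

from transformers import AutoTokenizer
from diffusion import Diffusion

tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
model = Diffusion.load_from_checkpoint(
    config.eval.checkpoint_path, config=config, tokenizer=tokenizer)
model.eval()

# stride_length / num_strides / dt match what generate_samples passes in
steps, samples, lengths = model.restore_model_and_semi_ar_sample(
    stride_length=config.sampling.stride_length,
    num_strides=config.sampling.num_strides,
    dt=1 / config.sampling.steps)
text_samples = samples[-1]  # final entry holds the fully generated sequences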
dit.py ADDED
@@ -0,0 +1,388 @@
1
+ import math
2
+ import typing
3
+
4
+ import flash_attn
5
+ import flash_attn.layers.rotary
6
+ import huggingface_hub
7
+ import omegaconf
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import rearrange
12
+
13
+ from transformers import AutoModel
14
+
15
+ # Flags required to enable jit fusion kernels
16
+ torch._C._jit_set_profiling_mode(False)
17
+ torch._C._jit_set_profiling_executor(False)
18
+ torch._C._jit_override_can_fuse_on_cpu(True)
19
+ torch._C._jit_override_can_fuse_on_gpu(True)
20
+
21
+
22
+ def bias_dropout_add_scale(
23
+ x: torch.Tensor,
24
+ bias: typing.Optional[torch.Tensor],
25
+ scale: torch.Tensor,
26
+ residual: typing.Optional[torch.Tensor],
27
+ prob: float,
28
+ training: bool) -> torch.Tensor:
29
+ if bias is not None:
30
+ out = scale * F.dropout(x + bias, p=prob, training=training)
31
+ else:
32
+ out = scale * F.dropout(x, p=prob, training=training)
33
+
34
+ if residual is not None:
35
+ out = residual + out
36
+ return out
37
+
38
+
39
+ def get_bias_dropout_add_scale(training):
40
+ def _bias_dropout_add(x, bias, scale, residual, prob):
41
+ return bias_dropout_add_scale(
42
+ x, bias, scale, residual, prob, training)
43
+
44
+ return _bias_dropout_add
45
+
46
+
47
+ # function overload
48
+ def modulate(x: torch.Tensor,
49
+ shift: torch.Tensor,
50
+ scale: torch.Tensor) -> torch.Tensor:
51
+ return x * (1 + scale) + shift
52
+
53
+
54
+ @torch.jit.script
55
+ def bias_dropout_add_scale_fused_train(
56
+ x: torch.Tensor,
57
+ bias: typing.Optional[torch.Tensor],
58
+ scale: torch.Tensor,
59
+ residual: typing.Optional[torch.Tensor],
60
+ prob: float) -> torch.Tensor:
61
+ return bias_dropout_add_scale(
62
+ x, bias, scale, residual, prob, True)
63
+
64
+
65
+ @torch.jit.script
66
+ def bias_dropout_add_scale_fused_inference(
67
+ x: torch.Tensor,
68
+ bias: typing.Optional[torch.Tensor],
69
+ scale: torch.Tensor,
70
+ residual: typing.Optional[torch.Tensor],
71
+ prob: float) -> torch.Tensor:
72
+ return bias_dropout_add_scale(
73
+ x, bias, scale, residual, prob, False)
74
+
75
+
76
+ @torch.jit.script
77
+ def modulate_fused(x: torch.Tensor,
78
+ shift: torch.Tensor,
79
+ scale: torch.Tensor) -> torch.Tensor:
80
+ return modulate(x, shift, scale)
81
+
82
+
83
+ class Rotary(torch.nn.Module):
84
+ def __init__(self, dim, base=10_000):
85
+ super().__init__()
86
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
87
+ self.register_buffer('inv_freq', inv_freq)
88
+ self.seq_len_cached = None
89
+ self.cos_cached = None
90
+ self.sin_cached = None
91
+
92
+ def forward(self, x, seq_dim=1):
93
+ seq_len = x.shape[seq_dim]
94
+ if seq_len != self.seq_len_cached:
95
+ self.seq_len_cached = seq_len
96
+ t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
97
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
98
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
99
+ # dims are: batch, seq_len, qkv, head, dim
100
+ self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
101
+ self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
102
+ # This makes the transformation on v an identity.
103
+ self.cos_cached[:,:,2,:,:].fill_(1.)
104
+ self.sin_cached[:,:,2,:,:].fill_(0.)
105
+
106
+ return self.cos_cached, self.sin_cached
107
+
108
+
109
+ def rotate_half(x):
110
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
111
+ return torch.cat((-x2, x1), dim=-1)
112
+
113
+
114
+ def apply_rotary_pos_emb(qkv, cos, sin):
115
+ cos = cos[0,:,0,0,:cos.shape[-1]//2]
116
+ sin = sin[0,:,0,0,:sin.shape[-1]//2]
117
+ return flash_attn.layers.rotary.apply_rotary_emb_qkv_(qkv, cos, sin)
118
+
119
+
120
+ # function overload
121
+ def modulate(x, shift, scale):
122
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
123
+
124
+
125
+ #################################################################################
126
+ # Layers #
127
+ #################################################################################
128
+ class LayerNorm(nn.Module):
129
+ def __init__(self, dim):
130
+ super().__init__()
131
+ self.weight = nn.Parameter(torch.ones([dim]))
132
+ self.dim = dim
133
+ def forward(self, x):
134
+ with torch.cuda.amp.autocast(enabled=False):
135
+ x = F.layer_norm(x.float(), [self.dim])
136
+ return x * self.weight[None,None,:]
137
+
138
+
139
+ def residual_linear(x, W, x_skip, residual_scale):
140
+ """x_skip + residual_scale * W @ x"""
141
+ dim_out, dim_in = W.shape[0], W.shape[1]
142
+ return torch.addmm(
143
+ x_skip.view(-1, dim_out),
144
+ x.view(-1, dim_in),
145
+ W.T,
146
+ alpha=residual_scale).view(*x.shape[:-1], dim_out)
147
+
148
+
149
+ #################################################################################
150
+ # Embedding Layers for Timesteps and Class Labels #
151
+ #################################################################################
152
+ class TimestepEmbedder(nn.Module):
153
+ """
154
+ Embeds scalar timesteps into vector representations.
155
+ """
156
+ def __init__(self, hidden_size, frequency_embedding_size=256):
157
+ super().__init__()
158
+ self.mlp = nn.Sequential(
159
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
160
+ nn.SiLU(),
161
+ nn.Linear(hidden_size, hidden_size, bias=True))
162
+ self.frequency_embedding_size = frequency_embedding_size
163
+
164
+ @staticmethod
165
+ def timestep_embedding(t, dim, max_period=10000):
166
+ """
167
+ Create sinusoidal timestep embeddings.
168
+ :param t: a 1-D Tensor of N indices, one per batch element.
169
+ These may be fractional.
170
+ :param dim: the dimension of the output.
171
+ :param max_period: controls the minimum frequency of the embeddings.
172
+ :return: an (N, D) Tensor of positional embeddings.
173
+ """
174
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
175
+ half = dim // 2
176
+ freqs = torch.exp(- math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device)
177
+
178
+ if t.ndim == 1:
179
+ t = t.unsqueeze(1)
180
+
181
+ args = t.float() * freqs[None, :]
182
+ #args = t[:, None].float() * freqs[None]
183
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
184
+ if dim % 2:
185
+ embedding = torch.cat(
186
+ [embedding,
187
+ torch.zeros_like(embedding[:, :1])], dim=-1)
188
+ return embedding
189
+
190
+ def forward(self, t):
191
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
192
+ t_emb = self.mlp(t_freq)
193
+ return t_emb
194
+
195
+
196
+ class LabelEmbedder(nn.Module):
197
+ """Embeds class labels into vector representations.
198
+
199
+ Also handles label dropout for classifier-free guidance.
200
+ """
201
+ def __init__(self, num_classes, cond_size):
202
+ super().__init__()
203
+ self.embedding_table = nn.Embedding(num_classes + 1, cond_size)
204
+ self.num_classes = num_classes
205
+
206
+ # TODO think of initializing with 0.02 std deviation like in original DiT paper
207
+
208
+ def forward(self, labels):
209
+ embeddings = self.embedding_table(labels)
210
+ return embeddings
211
+
212
+
213
+ #################################################################################
214
+ # Core Model #
215
+ #################################################################################
216
+
217
+
218
+ class DDiTBlock(nn.Module):
219
+ def __init__(self, dim, n_heads, cond_dim, mlp_ratio=4, dropout=0.1):
220
+ super().__init__()
221
+ self.n_heads = n_heads
222
+
223
+ self.norm1 = LayerNorm(dim)
224
+ self.attn_qkv = nn.Linear(dim, 3 * dim, bias=False)
225
+ self.attn_out = nn.Linear(dim, dim, bias=False)
226
+ self.dropout1 = nn.Dropout(dropout)
227
+
228
+ self.norm2 = LayerNorm(dim)
229
+ self.mlp = nn.Sequential(
230
+ nn.Linear(dim, mlp_ratio * dim, bias=True),
231
+ nn.GELU(approximate='tanh'),
232
+ nn.Linear(mlp_ratio * dim, dim, bias=True))
233
+ self.dropout2 = nn.Dropout(dropout)
234
+ self.dropout = dropout
235
+
236
+ self.adaLN_modulation = nn.Linear(cond_dim, 6 * dim, bias=True)
237
+ self.adaLN_modulation.weight.data.zero_()
238
+ self.adaLN_modulation.bias.data.zero_()
239
+
240
+
241
+ def _get_bias_dropout_scale(self):
242
+ if self.training:
243
+ return bias_dropout_add_scale_fused_train
244
+ else:
245
+ return bias_dropout_add_scale_fused_inference
246
+
247
+
248
+ def forward(self, x, rotary_cos_sin, c, seqlens=None):
249
+ batch_size, seq_len = x.shape[0], x.shape[1]
250
+
251
+ bias_dropout_scale_fn = self._get_bias_dropout_scale()
252
+
253
+ (shift_msa, scale_msa, gate_msa, shift_mlp,
254
+ scale_mlp, gate_mlp) = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
255
+
256
+ # attention operation
257
+ x_skip = x
258
+ x = modulate_fused(self.norm1(x), shift_msa, scale_msa)
259
+
260
+ qkv = self.attn_qkv(x)
261
+ qkv = rearrange(qkv,
262
+ 'b s (three h d) -> b s three h d',
263
+ three=3,
264
+ h=self.n_heads)
265
+ with torch.cuda.amp.autocast(enabled=False):
266
+ cos, sin = rotary_cos_sin
267
+ qkv = apply_rotary_pos_emb(
268
+ qkv, cos.to(qkv.dtype), sin.to(qkv.dtype))
269
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
270
+ if seqlens is None:
271
+ cu_seqlens = torch.arange(
272
+ 0, (batch_size + 1) * seq_len, step=seq_len,
273
+ dtype=torch.int32, device=qkv.device)
274
+ else:
275
+ cu_seqlens = seqlens.cumsum(-1)
276
+ x = flash_attn.flash_attn_interface.flash_attn_varlen_qkvpacked_func(
277
+ qkv, cu_seqlens, seq_len, 0., causal=False)
278
+
279
+ x = rearrange(x, '(b s) h d -> b s (h d)', b=batch_size)
280
+
281
+ x = bias_dropout_scale_fn(self.attn_out(x),
282
+ None,
283
+ gate_msa,
284
+ x_skip,
285
+ self.dropout)
286
+
287
+ # mlp operation
288
+ x = bias_dropout_scale_fn(
289
+ self.mlp(modulate_fused(
290
+ self.norm2(x), shift_mlp, scale_mlp)),
291
+ None, gate_mlp, x, self.dropout)
292
+ return x
293
+
294
+
295
+
296
+ class EmbeddingLayer(nn.Module):
297
+ def __init__(self, dim, vocab_dim):
298
+ super().__init__()
299
+ self.embedding = nn.Parameter(torch.empty((vocab_dim, dim)))
300
+ torch.nn.init.kaiming_uniform_(self.embedding, a=math.sqrt(5))
301
+
302
+ def forward(self, x):
303
+ return self.embedding[x]
304
+
305
+
306
+ class DDitFinalLayer(nn.Module):
307
+ def __init__(self, hidden_size, out_channels, cond_dim):
308
+ super().__init__()
309
+ self.norm_final = LayerNorm(hidden_size)
310
+ self.linear = nn.Linear(hidden_size, out_channels)
311
+ self.linear.weight.data.zero_()
312
+ self.linear.bias.data.zero_()
313
+
314
+ self.adaLN_modulation = nn.Linear(cond_dim,
315
+ 2 * hidden_size,
316
+ bias=True)
317
+ self.adaLN_modulation.weight.data.zero_()
318
+ self.adaLN_modulation.bias.data.zero_()
319
+
320
+
321
+ def forward(self, x, c):
322
+ shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
323
+ x = modulate_fused(self.norm_final(x), shift, scale)
324
+ x = self.linear(x)
325
+ return x
326
+
327
+
328
+ class DIT(nn.Module, huggingface_hub.PyTorchModelHubMixin):
329
+ def __init__(self, config, vocab_size: int, mlm_model_path):
330
+ super().__init__()
331
+ if type(config) == dict:
332
+ config = omegaconf.OmegaConf.create(config)
333
+
334
+ self.config = config
335
+ self.vocab_size = vocab_size
336
+
337
+ self.vocab_embed = EmbeddingLayer(config.model.hidden_size,
338
+ vocab_size)
339
+ self.sigma_map = TimestepEmbedder(config.model.cond_dim)
340
+ self.rotary_emb = Rotary(
341
+ config.model.hidden_size // config.model.n_heads)
342
+
343
+ blocks = []
344
+ for _ in range(config.model.n_blocks):
345
+ blocks.append(DDiTBlock(config.model.hidden_size,
346
+ config.model.n_heads,
347
+ config.model.cond_dim,
348
+ dropout=config.model.dropout))
349
+ self.blocks = nn.ModuleList(blocks)
350
+
351
+ self.output_layer = DDitFinalLayer(
352
+ config.model.hidden_size,
353
+ vocab_size,
354
+ config.model.cond_dim)
355
+ self.scale_by_sigma = config.model.scale_by_sigma
356
+
357
+ self.mlm_model = AutoModel.from_pretrained(mlm_model_path, device_map='cpu')
358
+
359
+ def _get_bias_dropout_scale(self):
360
+ if self.training:
361
+ return bias_dropout_add_scale_fused_train
362
+ else:
363
+ return bias_dropout_add_scale_fused_inference
364
+
365
+ def forward(self, indices, sigma):
366
+ x = self.vocab_embed(indices)
367
+ c_sigma = F.silu(self.sigma_map(sigma))
368
+
369
+ rotary_cos_sin = self.rotary_emb(x)
370
+
371
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
372
+ for i in range(len(self.blocks)):
373
+ x = self.blocks[i](x, rotary_cos_sin, c_sigma, seqlens=None)
374
+ x = self.output_layer(x, c_sigma)
375
+
376
+ # Extract membrane-specific embeddings from final encoder layer
377
+ # of fine-tuned ESM model
378
+ # with torch.no_grad():
379
+ # membrane_embedding = self.mlm_model(input_ids=, attention_mask=).last_hidden_state.squeeze(0)
380
+
381
+ # Fuse MLM embeddings with conditioning vector
382
+ # c = torch.cat([c_sigma, membrane_embedding], dim=-1)
383
+
384
+ # print(membrane_embedding.size())
385
+ # print(c_sigma.size())
386
+
387
+ return x
388
+
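A shape-check sketch for the DIT backbone above (not part of dit.py): token indices plus a per-example noise level go in, per-token logits come out. The config values and the 150M ESM path are placeholder assumptions, not the repo's actual model settings, and running it needs a CUDA GPU with flash-attn installed (it will also download the ESM checkpoint).

import torch
from dit import DIT

cfg = {'model': {'hidden_size': 768, 'cond_dim': 256, 'n_heads': 12,
                 'n_blocks': 12, 'dropout': 0.1, 'scale_by_sigma': False}}
model = DIT(cfg, vocab_size=33,
            mlm_model_path='facebook/esm2_t30_150M_UR50D').cuda()

indices = torch.randint(0, 33, (2, 128), device='cuda')  # (batch, seq_len) token ids
sigma = torch.rand(2, device='cuda')                      # one noise level per example
logits = model(indices, sigma)                            # (batch, seq_len, vocab_size)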
ema.py ADDED
@@ -0,0 +1,97 @@
1
+ import torch
2
+
3
+
4
+ class ExponentialMovingAverage:
5
+ """
6
+ Maintains (exponential) moving average of a set of parameters.
7
+ """
8
+
9
+ def __init__(self, parameters, decay, use_num_updates=True):
10
+ """
11
+ Args:
12
+ parameters: Iterable of `torch.nn.Parameter`; usually the result of
13
+ `model.parameters()`.
14
+ decay: The exponential decay.
15
+ use_num_updates: Whether to use number of updates when computing
16
+ averages.
17
+ """
18
+ if decay < 0.0 or decay > 1.0:
19
+ raise ValueError('Decay must be between 0 and 1')
20
+ self.decay = decay
21
+ self.num_updates = 0 if use_num_updates else None
22
+ self.shadow_params = [p.clone().detach()
23
+ for p in parameters if p.requires_grad]
24
+ self.collected_params = []
25
+
26
+ def move_shadow_params_to_device(self, device):
27
+ self.shadow_params = [i.to(device) for i in self.shadow_params]
28
+
29
+ def update(self, parameters):
30
+ """
31
+ Update currently maintained parameters.
32
+
33
+ Call this every time the parameters are updated, such as the result of
34
+ the `optimizer.step()` call.
35
+
36
+ Args:
37
+ parameters: Iterable of `torch.nn.Parameter`; usually the same set of
38
+ parameters used to initialize this object.
39
+ """
40
+ decay = self.decay
41
+ if self.num_updates is not None:
42
+ self.num_updates += 1
43
+ decay = min(decay, (1 + self.num_updates) /
44
+ (10 + self.num_updates))
45
+ one_minus_decay = 1.0 - decay
46
+ with torch.no_grad():
47
+ parameters = [p for p in parameters if p.requires_grad]
48
+ for s_param, param in zip(self.shadow_params, parameters):
49
+ s_param.sub_(one_minus_decay * (s_param - param))
50
+
51
+ def copy_to(self, parameters):
52
+ """
53
+ Copy current parameters into given collection of parameters.
54
+
55
+ Args:
56
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
57
+ updated with the stored moving averages.
58
+ """
59
+ parameters = [p for p in parameters if p.requires_grad]
60
+ for s_param, param in zip(self.shadow_params, parameters):
61
+ if param.requires_grad:
62
+ param.data.copy_(s_param.data)
63
+
64
+ def store(self, parameters):
65
+ """
66
+ Save the current parameters for restoring later.
67
+
68
+ Args:
69
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
70
+ temporarily stored.
71
+ """
72
+ self.collected_params = [param.clone() for param in parameters]
73
+
74
+ def restore(self, parameters):
75
+ """
76
+ Restore the parameters stored with the `store` method.
77
+ Useful to validate the model with EMA parameters without affecting the
78
+ original optimization process. Store the parameters before the
79
+ `copy_to` method. After validation (or model saving), use this to
80
+ restore the former parameters.
81
+
82
+ Args:
83
+ parameters: Iterable of `torch.nn.Parameter`; the parameters to be
84
+ updated with the stored parameters.
85
+ """
86
+ for c_param, param in zip(self.collected_params, parameters):
87
+ param.data.copy_(c_param.data)
88
+
89
+ def state_dict(self):
90
+ return dict(decay=self.decay,
91
+ num_updates=self.num_updates,
92
+ shadow_params=self.shadow_params)
93
+
94
+ def load_state_dict(self, state_dict):
95
+ self.decay = state_dict['decay']
96
+ self.num_updates = state_dict['num_updates']
97
+ self.shadow_params = state_dict['shadow_params']
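The EMA helper above is used in diffusion.py with a store, copy_to, restore pattern around evaluation and sampling; a minimal sketch of that pattern on a toy module (not part of ema.py):

import torch
from ema import ExponentialMovingAverage

net = torch.nn.Linear(4, 4)
ema = ExponentialMovingAverage(net.parameters(), decay=0.9999)

# training loop: call after every optimizer.step()
ema.update(net.parameters())

# evaluate with EMA weights, then put the raw training weights back
ema.store(net.parameters())
ema.copy_to(net.parameters())
# ... run validation / sampling here ...
ema.restore(net.parameters())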
esm_utils.py ADDED
@@ -0,0 +1,15 @@
1
+ import torch
2
3
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
4
+
5
+ def load_esm2_model(model_name):
6
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
7
+ masked_model = AutoModelForMaskedLM.from_pretrained(model_name)
8
+ embedding_model = AutoModel.from_pretrained(model_name)
9
+ return tokenizer, masked_model, embedding_model
10
+
11
+ def get_latents(model, tokenizer, sequence, device):
12
+ inputs = tokenizer(sequence, return_tensors="pt").to(device)
13
+ with torch.no_grad():
14
+ outputs = model(**inputs).last_hidden_state.squeeze(0)
15
+ return outputs
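A small usage sketch for the helpers above (not part of esm_utils.py); the 8M ESM-2 checkpoint is used only to keep the example light.

import torch
from esm_utils import load_esm2_model, get_latents

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer, masked_model, embedding_model = load_esm2_model("facebook/esm2_t6_8M_UR50D")
embedding_model = embedding_model.to(device).eval()

latents = get_latents(embedding_model, tokenizer, "MKTAYIAKQR", device)
print(latents.shape)  # (len(sequence) + 2, hidden_size), BOS/EOS tokens included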
generate.py ADDED
@@ -0,0 +1,60 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import math
4
+ import random
5
+ import sys
6
+ import pandas as pd
7
+ from mlm_generate_utils import mask_for_de_novo, calculate_cosine_sim, calculate_hamming_dist
8
+ from diffusion import Diffusion
9
+ import hydra
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer, AutoModel, pipeline
12
+
13
+
14
+ @torch.no_grad()
15
+ def generate_sequence(sequence_length: int, tokenizer, mdlm: Diffusion):
16
+ global masked_sequence
17
+ masked_sequence = mask_for_de_novo(sequence_length)
18
+ inputs = tokenizer(masked_sequence, return_tensors="pt").to(mdlm.device)
19
+ logits = mdlm._sample(x_input=inputs) # using sample, change config.sampling.steps to determine robustness
20
+ generated_sequence = tokenizer.decode(logits.squeeze())
21
+
22
+ return generated_sequence
23
+
24
+
25
+ @hydra.main(version_base=None, config_path='configs', config_name='config')
26
+ def mdlm_motif_benchmark(config):
27
+ path = "/workspace/sg666/MDpLM"
28
+
29
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t30_150M_UR50D")
30
+ mdlm_model = Diffusion.load_from_checkpoint(config.eval.checkpoint_path, config=config, tokenizer=tokenizer)
31
+
32
+ mdlm_model.eval()
33
+ device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
34
+ mdlm_model.to(device)
35
+
36
+ print("loaded models...")
37
+
38
+ # Get 100 random sequence lengths to generate
39
+ sequence_lengths = [random.randint(50, 1000) for _ in range(100)]
40
+
41
+ generation_results = []
42
+ for seq_length in tqdm(sequence_lengths, desc="Generating sequences"):
43
+ generated_sequence = generate_sequence(seq_length, tokenizer, mdlm_model)
44
+ generated_sequence = generated_sequence[5:-5].replace(" ", "") # Remove bos/eos tokens
45
+
46
+ perplexity = mdlm_model.compute_masked_perplexity([generated_sequence], masked_sequence)
47
+ perplexity = round(perplexity, 4)
48
+
49
+ generation_results.append([generated_sequence, perplexity])
50
+
51
+ print(f"perplexity: {perplexity} | length: {seq_length} | generated sequence: {generated_sequence}")
52
+ sys.stdout.flush()
53
+
54
+ df = pd.DataFrame(generation_results, columns=['Generated Sequence', 'Perplexity'])
55
+ df.to_csv(path + f'/benchmarks/mdlm_de-novo_generation_results.csv', index=False)
56
+
57
+
58
+
59
+ if __name__ == "__main__":
60
+ mdlm_motif_benchmark()
main.py ADDED
@@ -0,0 +1,250 @@
1
+ import os
2
+
3
+ import wandb
4
+ import fsspec
5
+ import hydra
6
+ import lightning as L
7
+ import omegaconf
8
+ import rich.syntax
9
+ import rich.tree
10
+ import torch
11
+
12
+ import pl_data_loader as dataloader
13
+ from diffusion import Diffusion
14
+ import utils
15
+
16
+ from lightning.pytorch.strategies import DDPStrategy
17
+ from transformers import AutoTokenizer
18
+ from datasets import load_from_disk, load_dataset
19
+
20
+ #wandb.login(key="2b76a2fa2c1cdfddc5f443602c17b011fefb0a8f")
21
+ omegaconf.OmegaConf.register_new_resolver(
22
+ 'cwd', os.getcwd)
23
+ omegaconf.OmegaConf.register_new_resolver(
24
+ 'device_count', torch.cuda.device_count)
25
+ omegaconf.OmegaConf.register_new_resolver(
26
+ 'eval', eval)
27
+ omegaconf.OmegaConf.register_new_resolver(
28
+ 'div_up', lambda x, y: (x + y - 1) // y)
29
+
30
+
31
+ def _load_from_checkpoint(config, tokenizer):
32
+ if 'hf' in config.backbone:
33
+ return Diffusion(
34
+ config, tokenizer=tokenizer).to('cuda')
35
+ else:
36
+ model = Diffusion.load_from_checkpoint(
37
+ config.eval.checkpoint_path,
38
+ tokenizer=tokenizer,
39
+ config=config)
40
+
41
+ return model
42
+
43
+ @L.pytorch.utilities.rank_zero_only
44
+ def _print_config(
45
+ config: omegaconf.DictConfig,
46
+ resolve: bool = True,
47
+ save_cfg: bool = True) -> None:
48
+ """Prints content of DictConfig using Rich library and its tree structure.
49
+
50
+ Args:
51
+ config (DictConfig): Configuration composed by Hydra.
52
+ resolve (bool): Whether to resolve reference fields of DictConfig.
53
+ save_cfg (bool): Whether to save the configuration tree to a file.
54
+ """
55
+
56
+ style = 'dim'
57
+ tree = rich.tree.Tree('CONFIG', style=style, guide_style=style)
58
+
59
+ fields = config.keys()
60
+ for field in fields:
61
+ branch = tree.add(field, style=style, guide_style=style)
62
+
63
+ config_section = config.get(field)
64
+ branch_content = str(config_section)
65
+ if isinstance(config_section, omegaconf.DictConfig):
66
+ branch_content = omegaconf.OmegaConf.to_yaml(
67
+ config_section, resolve=resolve)
68
+
69
+ branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
70
+ rich.print(tree)
71
+ if save_cfg:
72
+ with fsspec.open(
73
+ '{}/config_tree.txt'.format(
74
+ config.checkpointing.save_dir), 'w') as fp:
75
+ rich.print(tree, file=fp)
76
+
77
+
78
+ @L.pytorch.utilities.rank_zero_only
79
+ def _print_batch(train_ds, valid_ds, tokenizer, k=64):
80
+ #for dl_type, dl in [
81
+ #('train', train_ds), ('valid', valid_ds)]:
82
+ for dl_type, dl in [
83
+ ('train', train_ds)]:
84
+ print(f'Printing {dl_type} dataloader batch.')
85
+ batch = next(iter(dl))
86
+ print('Batch input_ids.shape', batch['input_ids'].shape)
87
+ first = batch['input_ids'][0, :k]
88
+ last = batch['input_ids'][0, -k:]
89
+ print(f'First {k} tokens:', tokenizer.decode(first))
90
+ print('ids:', first)
91
+ print(f'Last {k} tokens:', tokenizer.decode(last))
92
+ print('ids:', last)
93
+
94
+
95
+ def generate_samples(config, logger, tokenizer):
96
+ logger.info('Generating samples.')
97
+ model = _load_from_checkpoint(config=config,
98
+ tokenizer=tokenizer)
99
+ model.gen_ppl_metric.reset()
100
+ if config.eval.disable_ema:
101
+ logger.info('Disabling EMA.')
102
+ model.ema = None
103
+ stride_length = config.sampling.stride_length
104
+ num_strides = config.sampling.num_strides
105
+ for _ in range(config.sampling.num_sample_batches):
106
+ if config.sampling.semi_ar:
107
+ _, intermediate_samples, _ = model.restore_model_and_semi_ar_sample(
108
+ stride_length=stride_length,
109
+ num_strides=num_strides,
110
+ dt=1 / config.sampling.steps)
111
+ text_samples = intermediate_samples[-1]
112
+ # Note: Samples generated using semi-ar method
113
+ # need to to be processed before computing generative perplexity
114
+ # since these samples contain numerous <|endoftext|> tokens
115
+ # and diffusion.compute_generative_perplexity() discards
116
+ # any text after the first EOS token.
117
+ else:
118
+ samples = model.restore_model_and_sample(
119
+ num_steps=config.sampling.steps)
120
+ text_samples = model.tokenizer.batch_decode(samples)
121
+ model.compute_generative_perplexity(text_samples)
122
+ print('Text samples:', text_samples)
123
+ if not config.sampling.semi_ar:
124
+ print('Generative perplexity:',
125
+ model.gen_ppl_metric.compute())
126
+ return text_samples
127
+
128
+ def _ppl_eval(config, logger, tokenizer, data_module):
129
+ logger.info('Starting Zero Shot Eval.')
130
+
131
+ model = _load_from_checkpoint(config=config,
132
+ tokenizer=tokenizer)
133
+ if config.eval.disable_ema:
134
+ logger.info('Disabling EMA.')
135
+ model.ema = None
136
+
137
+ wandb_logger = None
138
+ if config.get('wandb', None) is not None:
139
+ wandb_logger = L.pytorch.loggers.WandbLogger(
140
+ config=omegaconf.OmegaConf.to_object(config),
141
+ ** config.wandb)
142
+ callbacks = []
143
+ if 'callbacks' in config:
144
+ for _, callback in config.callbacks.items():
145
+ callbacks.append(hydra.utils.instantiate(callback))
146
+ trainer = hydra.utils.instantiate(
147
+ config.trainer,
148
+ default_root_dir=os.getcwd(),
149
+ callbacks=callbacks,
150
+ strategy=DDPStrategy(find_unused_parameters=True),
151
+ logger=wandb_logger)
152
+ # _, valid_ds = dataloader.get_dataloaders(
153
+ # config, tokenizer, skip_train=True, valid_seed=config.seed)
154
+ trainer.test(model, data_module)
155
+
156
+
157
+ def _train(config, logger, tokenizer, data_module):
158
+ logger.info('Starting Training.')
159
+ wandb_logger = None
160
+ if config.get('wandb', None) is not None:
161
+ wandb_logger = L.pytorch.loggers.WandbLogger(
162
+ config=omegaconf.OmegaConf.to_object(config),
163
+ ** config.wandb)
164
+
165
+ if (config.checkpointing.resume_from_ckpt
166
+ and config.checkpointing.resume_ckpt_path is not None
167
+ and utils.fsspec_exists(
168
+ config.checkpointing.resume_ckpt_path)):
169
+ ckpt_path = config.checkpointing.resume_ckpt_path
170
+ else:
171
+ ckpt_path = None
172
+
173
+ # Lightning callbacks
174
+ callbacks = []
175
+ if 'callbacks' in config:
176
+ for _, callback in config.callbacks.items():
177
+ callbacks.append(hydra.utils.instantiate(callback))
178
+ '''
179
+ train_ds, valid_ds = dataloader.get_dataloaders(
180
+ config, tokenizer)
181
+ _print_batch(train_ds, valid_ds, tokenizer)
182
+
183
+ model = diffusion.Diffusion(
184
+ config, tokenizer=valid_ds.tokenizer)
185
+ '''
186
+ trainer = hydra.utils.instantiate(
187
+ config.trainer,
188
+ default_root_dir=os.getcwd(),
189
+ callbacks=callbacks,
190
+ accelerator='cuda',
191
+ strategy=DDPStrategy(find_unused_parameters=True),
192
+ logger=wandb_logger)
193
+
194
+ model = Diffusion(
195
+ config, tokenizer=tokenizer)
196
+
197
+ trainer.fit(model, datamodule=data_module, ckpt_path=ckpt_path)
198
+
199
+ '''
200
+ trainer.fit(model, train_ds, valid_ds, ckpt_path=ckpt_path)
201
+ '''
202
+
203
+ @hydra.main(version_base=None, config_path='configs', config_name='config')
204
+ def main(config):
205
+ """Main entry point for training."""
206
+ L.seed_everything(config.seed)
207
+ _print_config(config, resolve=True, save_cfg=True)
208
+
209
+ logger = utils.get_logger(__name__)
210
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
211
+
212
+ if config.backbone == "vanilla_esm_pretrain":
213
+ train_dataset = load_dataset('csv', data_files=config.data.train.vanilla_esm_train_path)
214
+ val_dataset = load_dataset('csv', data_files=config.data.valid.vanilla_esm_valid_path)
215
+ test_dataset = load_dataset('csv', data_files=config.data.test.vanilla_esm_test_path)
216
+ elif config.backbone == "membrane_esm_finetune" or config.backbone == "dit":
217
+ train_dataset = load_dataset('csv', data_files=config.data.train.membrane_esm_train_path)
218
+ val_dataset = load_dataset('csv', data_files=config.data.valid.membrane_esm_valid_path)
219
+ test_dataset = load_dataset('csv', data_files=config.data.test.membrane_esm_test_path)
220
+
221
+ lst = [i for i in range(1, 200)]
222
+
223
+ train_dataset = train_dataset['train']#.select(lst)
224
+ val_dataset = val_dataset['train']#.select(lst)
225
+ test_dataset = test_dataset['train']#.select(lst)
226
+
227
+ if config.training.focus_mask:
228
+ collator = dataloader.membrane_collate_fn
229
+ elif config.data.wrapping:
230
+ collator = dataloader.wrap_collate_fn
231
+ else:
232
+ collator = dataloader.collate_fn  # assumes the default collate_fn defined in pl_data_loader
233
+
234
+ data_module = dataloader.CustomDataModule(
235
+ train_dataset, val_dataset, test_dataset,
236
+ tokenizer,
237
+ batch_size=config.loader.batch_size,
238
+ collate_fn=collator
239
+ )
240
+
241
+ if config.mode == 'sample_eval':
242
+ generate_samples(config, logger, tokenizer)
243
+ elif config.mode == 'ppl_eval':
244
+ _ppl_eval(config, logger, tokenizer, data_module)
245
+ else:
246
+ _train(config, logger, tokenizer, data_module)
247
+
248
+
249
+ if __name__ == '__main__':
250
+ main()
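For reference, a sketch (not part of main.py) of composing the same Hydra config programmatically instead of through the CLI entry point above; the override value is only an example.

from hydra import compose, initialize

with initialize(version_base=None, config_path='configs'):
    config = compose(config_name='config', overrides=['mode=ppl_eval'])

print(config.mode)  # 'ppl_eval'; main() branches on this value to pick eval vs. training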
mdlm_motif_benchmarking.py ADDED
@@ -0,0 +1,96 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import math
4
+ import random
5
+ import sys
6
+ import pandas as pd
7
+ from mlm_generate_utils import mask_for_scaffold, calculate_cosine_sim, calculate_hamming_dist
8
+ from diffusion import Diffusion
9
+ import hydra
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer, AutoModel, pipeline
12
+
13
+ def masking_test(sequence: str, generate_case: str, tokenizer, mask_prob: float = 0.50):
14
+ """
15
+ Masks 50% of the tokens in the sequence.
16
+ """
17
+ tokens = list(sequence.upper())
18
+ num_tokens_to_mask = int(mask_prob * len(tokens)) # Select some fraction of the tokens
19
+ print(num_tokens_to_mask, len(tokens))
20
+
21
+ # Get random indices to mask
22
+ mask_indices = random.sample(range(len(tokens)), num_tokens_to_mask)
23
+
24
+ for idx in mask_indices:
25
+ tokens[idx] = tokenizer.mask_token # Replace with mask token
26
+
27
+ return ''.join(tokens)
28
+
29
+
30
+
31
+ @torch.no_grad()
32
+ def generate_scaffold_mdlm(sequence: str, generate_case: str, tokenizer, mdlm: Diffusion):
33
+ # # Mask soluble or transmembrane domains
34
+ # masked_sequence = mask_for_scaffold(sequence, generate_case)
35
+
36
+ # # Test out different masking rates
37
+ # masked_sequence = masking_test(sequence, generate_case, tokenizer)
38
+
39
+ # 100% masking rate, de novo generation
40
+ masked_sequence = len(sequence) * "<mask>"
41
+
42
+ print(masked_sequence)
43
+
44
+ inputs = tokenizer(masked_sequence, return_tensors="pt").to(mdlm.device)
45
+
46
+ logits = mdlm._sample(x_input=inputs) # using sample, change config.sampling.steps to determine robustness
47
+ # logits = mdlm.forward(inputs)
48
+ # print(tokenizer.decode(logits.squeeze(), skip_special_tokens=True))
49
+
50
+ return tokenizer.decode(logits.squeeze()), masked_sequence
51
+
52
+
53
+ @hydra.main(version_base=None, config_path='configs', config_name='config')
54
+ def mdlm_motif_benchmark(config):
55
+ path = "/workspace/sg666/MDpLM"
56
+
57
+ test_sequences = pd.read_csv(path + "/data/membrane/test.csv")['Sequence'].tolist()
58
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
59
+
60
+ mdlm_model = Diffusion.load_from_checkpoint(config.eval.checkpoint_path, config=config, tokenizer=tokenizer)
61
+ esm_model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D") # model used for functionality testing
62
+
63
+ mdlm_model.eval()
64
+ esm_model.eval()
65
+
66
+ print("loaded models...")
67
+
68
+ device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
69
+ mdlm_model.to(device)
70
+ esm_model.to(device)
71
+
72
+ for generate_case in ['uppercase', 'lowercase']:
73
+ case_results = []
74
+ for original_sequence in tqdm(test_sequences, desc=f"scaffolding ({generate_case}): "):
75
+
76
+ generated_sequence, masked_input = generate_scaffold_mdlm(original_sequence, generate_case, tokenizer, mdlm_model)
77
+ generated_sequence = generated_sequence[5:-5].replace(" ", "") # Remove bos/eos tokens
78
+
79
+ perplexity = mdlm_model.compute_masked_perplexity([original_sequence], masked_input)
80
+ cos_sim = calculate_cosine_sim(original_sequence, generated_sequence, tokenizer, esm_model, device)
81
+ hamming_distance = calculate_hamming_dist(original_sequence, generated_sequence)
82
+
83
+ case_results.append([original_sequence, generated_sequence, perplexity, cos_sim, hamming_distance])
84
+
85
+ print("perplexity: ", perplexity, "cos sim: ", cos_sim, "hamming: ", hamming_distance)
86
+ print(f"generated sequence: {generated_sequence}")
87
+ print(f"original sequence: {original_sequence.upper()}")
88
+ sys.stdout.flush()
89
+
90
+ df = pd.DataFrame(case_results, columns=['Original Sequence', 'Generated Sequence', 'Perplexity', 'Cosine Similarity', 'Hamming Distance'])
91
+ df.to_csv(path + f'/benchmarks/MLM/mlm_{generate_case}_results.csv', index=False)
92
+
93
+
94
+
95
+ if __name__ == "__main__":
96
+ mdlm_motif_benchmark()
mlm_generate_utils.py ADDED
@@ -0,0 +1,108 @@
1
+ import torch
2
+ import math
3
4
+ import sys
5
+ import pandas as pd
6
+ from esm_utils import get_latents
7
+ from transformers import AutoModelForMaskedLM, AutoModel, AutoTokenizer
8
+
9
+
10
+ def mask_for_de_novo(sequence_length):
11
+ return "<mask>" * sequence_length
12
+
13
+ def generate_de_novo(sequence_length, tokenizer, model):
14
+ masked_sequence = mask_for_de_novo(sequence_length)
15
+ inputs = tokenizer(masked_sequence, return_tensors='pt').to(model.device)
16
+
17
+ with torch.no_grad():
18
+ logits = model(**inputs).logits
19
+ mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
20
+ logits_at_masks = logits[0, mask_token_indices]
21
+
22
+ pred_tokens = []
23
+ for i in range(len(mask_token_indices)):  # index positionally into logits_at_masks (cf. generate_scaffold)
24
+ topk_logits, topk_indices = logits_at_masks[i].topk(k=3, dim=-1)
25
+ probabilities = torch.nn.functional.softmax(topk_logits, dim=-1)
26
+ predicted_index = torch.distributions.categorical.Categorical(probabilities).sample()
27
+ predicted_token_id = topk_indices[predicted_index].item()
28
+ predicted_token = tokenizer.decode([predicted_token_id], skip_special_tokens=True)
29
+ pred_tokens.append(predicted_token)
30
+
31
+ generated_sequence = ''.join(pred_tokens)
32
+ perplexity = calculate_perplexity(model, tokenizer, generated_sequence, mask_token_indices)
33
+
34
+ return (generated_sequence, perplexity)
35
+
36
+
37
+ def mask_for_scaffold(sequence, generate_type):
38
+ if generate_type == "uppercase":
39
+ sequence = ''.join(["<mask>" if residue.isupper() else residue.upper() for residue in sequence])
40
+ elif generate_type == "lowercase":
41
+ sequence = ''.join(["<mask>" if residue.islower() else residue for residue in sequence])
42
+ return sequence
43
+
44
+
45
+ def generate_scaffold(sequence, generate_type, tokenizer, model):
46
+ masked_sequence = mask_for_scaffold(sequence, generate_type)
47
+ inputs = tokenizer(masked_sequence, return_tensors='pt').to(model.device)
48
+
49
+ with torch.no_grad():
50
+ logits = model(**inputs).logits
51
+ mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
52
+ logits_at_masks = logits[0, mask_token_indices]
53
+
54
+ pred_tokens = []
55
+ for i in range(len(mask_token_indices)):
56
+ topk_logits, topk_indices = logits_at_masks[i].topk(k=3, dim=-1)
57
+ probabilities = torch.nn.functional.softmax(topk_logits, dim=-1)
58
+ predicted_index = torch.distributions.categorical.Categorical(probabilities).sample()
59
+ predicted_token_id = topk_indices[predicted_index].item()
60
+ predicted_token = tokenizer.decode([predicted_token_id], skip_special_tokens=True)
61
+
62
+ pred_tokens.append('G' if predicted_token == '' else predicted_token)
63
+
64
+ generated_sequence = masked_sequence
65
+ for token in pred_tokens:
66
+ generated_sequence = generated_sequence.replace("<mask>", token, 1)
67
+
68
+ return generated_sequence, mask_token_indices
69
+
70
+
71
+ def calculate_perplexity(model, tokenizer, generated_sequence, mask_token_indices):
72
+ total_loss = 0.0
73
+ tensor_input = tokenizer.encode(generated_sequence, return_tensors='pt').to(model.device)
74
+
75
+ for i in mask_token_indices:
76
+ masked_input = tensor_input.clone()
77
+ masked_input[0, i] = tokenizer.mask_token_id
78
+
79
+ labels = torch.full(tensor_input.shape, -100).to(model.device)
80
+ labels[0, i] = tensor_input[0, i]
81
+
82
+ with torch.no_grad():
83
+ outputs = model(masked_input, labels=labels)
84
+ total_loss += outputs.loss.item()
85
+
86
+ num_mask_tokens = len(mask_token_indices)
87
+ if num_mask_tokens == 0:
88
+ perplexity = 10000
89
+ else:
90
+ avg_loss = total_loss / num_mask_tokens
91
+ perplexity = math.exp(avg_loss)
92
+
93
+ return perplexity
94
+
95
+
96
+ def calculate_cosine_sim(original_sequence, generated_sequence, tokenizer, esm_model, device):
97
+ og_embeddings = get_latents(esm_model, tokenizer, original_sequence.upper(), device)
98
+ new_embeddings = get_latents(esm_model, tokenizer, generated_sequence, device)
99
+
100
+ sequence_similarity = torch.nn.functional.cosine_similarity(og_embeddings, new_embeddings, dim=-1)
101
+ cosine_similarity = torch.mean(sequence_similarity).item()
102
+ return cosine_similarity
103
+
104
+
105
+ def calculate_hamming_dist(original_sequence, generated_sequence):
106
+ generated_sequence = generated_sequence.upper()
107
+ original_sequence = original_sequence.upper()
108
+ return sum(1 if original_sequence[i] != generated_sequence[i] else 0 for i in range(len(original_sequence)))
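A quick illustration (not part of mlm_generate_utils.py) of how mask_for_scaffold treats case: uppercase and lowercase residues mark the two region types, so the two modes mask complementary parts of the sequence.

from mlm_generate_utils import mask_for_scaffold

seq = "MKTayiAKQR"
print(mask_for_scaffold(seq, "uppercase"))
# -> "<mask><mask><mask>AYI<mask><mask><mask><mask>"  (uppercase residues masked)
print(mask_for_scaffold(seq, "lowercase"))
# -> "MKT<mask><mask><mask>AKQR"                      (lowercase residues masked)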
noise_schedule.py ADDED
@@ -0,0 +1,153 @@
1
+ import abc
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ # Flags required to enable jit fusion kernels
7
+ torch._C._jit_set_profiling_mode(False)
8
+ torch._C._jit_set_profiling_executor(False)
9
+ torch._C._jit_override_can_fuse_on_cpu(True)
10
+ torch._C._jit_override_can_fuse_on_gpu(True)
11
+
12
+
13
+ def get_noise(config, dtype=torch.float32):
14
+ # NOTE: hard-coded to LogLinearNoise for now; the dispatch on
+ # config.noise.type below is unreachable until this early return is removed.
+ return LogLinearNoise()
15
+
16
+ if config.noise.type == 'geometric':
17
+ return GeometricNoise(config.noise.sigma_min,
18
+ config.noise.sigma_max)
19
+ elif config.noise.type == 'loglinear':
20
+ return LogLinearNoise()
21
+ elif config.noise.type == 'cosine':
22
+ return CosineNoise()
23
+ elif config.noise.type == 'cosinesqr':
24
+ return CosineSqrNoise()
25
+ elif config.noise.type == 'linear':
26
+ return Linear(config.noise.sigma_min,
27
+ config.noise.sigma_max,
28
+ dtype)
29
+ else:
30
+ raise ValueError(f'{config.noise.type} is not a valid noise')
31
+
32
+
33
+ def binary_discretization(z):
34
+ z_hard = torch.sign(z)
35
+ z_soft = z / torch.norm(z, dim=-1, keepdim=True)
36
+ return z_soft + (z_hard - z_soft).detach()
37
+
38
+
39
+ class Noise(abc.ABC, nn.Module):
40
+ """
41
+ Baseline forward method to get the total + rate of noise at a timestep
42
+ """
43
+ def forward(self, t):
44
+ # Assume time goes from 0 to 1
45
+ return self.total_noise(t), self.rate_noise(t)
46
+
47
+ @abc.abstractmethod
48
+ def rate_noise(self, t):
49
+ """
50
+ Rate of change of noise ie g(t)
51
+ """
52
+ pass
53
+
54
+ @abc.abstractmethod
55
+ def total_noise(self, t):
56
+ """
57
+ Total noise ie \int_0^t g(t) dt + g(0)
58
+ """
59
+ pass
60
+
61
+
62
+ class CosineNoise(Noise):
63
+ def __init__(self, eps=1e-3):
64
+ super().__init__()
65
+ self.eps = eps
66
+
67
+ def rate_noise(self, t):
68
+ cos = (1 - self.eps) * torch.cos(t * torch.pi / 2)
69
+ sin = (1 - self.eps) * torch.sin(t * torch.pi / 2)
70
+ scale = torch.pi / 2
71
+ return scale * sin / (cos + self.eps)
72
+
73
+ def total_noise(self, t):
74
+ cos = torch.cos(t * torch.pi / 2)
75
+ return - torch.log(self.eps + (1 - self.eps) * cos)
76
+
77
+
78
+ class CosineSqrNoise(Noise):
79
+ def __init__(self, eps=1e-3):
80
+ super().__init__()
81
+ self.eps = eps
82
+
83
+ def rate_noise(self, t):
84
+ cos = (1 - self.eps) * (
85
+ torch.cos(t * torch.pi / 2) ** 2)
86
+ sin = (1 - self.eps) * torch.sin(t * torch.pi)
87
+ scale = torch.pi / 2
88
+ return scale * sin / (cos + self.eps)
89
+
90
+ def total_noise(self, t):
91
+ cos = torch.cos(t * torch.pi / 2) ** 2
92
+ return - torch.log(self.eps + (1 - self.eps) * cos)
93
+
94
+
95
+ class Linear(Noise):
96
+ def __init__(self, sigma_min=0, sigma_max=10, dtype=torch.float32):
97
+ super().__init__()
98
+ self.sigma_min = torch.tensor(sigma_min, dtype=dtype)
99
+ self.sigma_max = torch.tensor(sigma_max, dtype=dtype)
100
+
101
+ def rate_noise(self, t):
102
+ return self.sigma_max - self.sigma_min
103
+
104
+ def total_noise(self, t):
105
+ return self.sigma_min + t * (self.sigma_max - self.sigma_min)
106
+
107
+ def importance_sampling_transformation(self, t):
108
+ f_T = torch.log1p(- torch.exp(- self.sigma_max))
109
+ f_0 = torch.log1p(- torch.exp(- self.sigma_min))
110
+ sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
111
+ return (sigma_t - self.sigma_min) / (
112
+ self.sigma_max - self.sigma_min)
113
+
114
+
115
+ class GeometricNoise(Noise):
116
+ def __init__(self, sigma_min=1e-3, sigma_max=1):
117
+ super().__init__()
118
+ self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
119
+
120
+ def rate_noise(self, t):
121
+ return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (
122
+ self.sigmas[1].log() - self.sigmas[0].log())
123
+
124
+ def total_noise(self, t):
125
+ return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
126
+
127
+
128
+ class LogLinearNoise(Noise):
129
+ """Log Linear noise schedule.
130
+
131
+ Built such that 1 - 1/e^(n(t)) interpolates between 0 and
132
+ ~1 when t varies from 0 to 1. Total noise is
133
+ -log(1 - (1 - eps) * t), so the masking probability
+ 1 - exp(-sigma(t)) equals (1 - eps) * t.
135
+ """
136
+ def __init__(self, eps=1e-3):
137
+ super().__init__()
138
+ self.eps = eps
139
+ self.sigma_max = self.total_noise(torch.tensor(1.0))
140
+ self.sigma_min = self.eps + self.total_noise(torch.tensor(0.0))
141
+
142
+ def rate_noise(self, t):
143
+ return (1 - self.eps) / (1 - (1 - self.eps) * t)
144
+
145
+ def total_noise(self, t):
146
+ return -torch.log1p(-(1 - self.eps) * t)
147
+
148
+ def importance_sampling_transformation(self, t):
149
+ f_T = torch.log1p(- torch.exp(- self.sigma_max))
150
+ f_0 = torch.log1p(- torch.exp(- self.sigma_min))
151
+ sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
152
+ t = - torch.expm1(- sigma_t) / (1 - self.eps)
153
+ return t
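A small worked check (not part of noise_schedule.py) of the LogLinearNoise relationship described in its docstring: the masking probability 1 - exp(-sigma(t)), which diffusion.py computes as move_chance, grows linearly in t.

import torch
from noise_schedule import LogLinearNoise

noise = LogLinearNoise(eps=1e-3)
t = torch.tensor([0.25, 0.5, 1.0])
sigma, dsigma = noise(t)              # total noise and its rate at time t
move_chance = 1 - torch.exp(-sigma)   # probability a token is masked at time t
print(move_chance)                    # equals (1 - eps) * t: approximately [0.25, 0.50, 0.999]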
pl_data_loader.py ADDED
@@ -0,0 +1,819 @@
1
+ import functools
2
+ import itertools
3
+ import json
4
+ import math
5
+ import os
6
+ import re
7
+ import shutil
8
+ import typing
9
+ import urllib
10
+ import zipfile
11
+
12
+ import datasets
13
+ import fsspec
14
+ import requests
15
+ import tokenizers
16
+ import torch
17
+ import transformers
18
+
19
+ import utils
20
+
21
+ LOGGER = utils.get_logger(__name__)
22
+
23
+
24
+ def wt_detokenizer(string):
25
+ # contractions
26
+ string = string.replace("s '", "s'")
27
+ string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
28
+ # number separators
29
+ string = string.replace(" @-@ ", "-")
30
+ string = string.replace(" @,@ ", ",")
31
+ string = string.replace(" @.@ ", ".")
32
+ # punctuation
33
+ string = string.replace(" : ", ": ")
34
+ string = string.replace(" ; ", "; ")
35
+ string = string.replace(" . ", ". ")
36
+ string = string.replace(" ! ", "! ")
37
+ string = string.replace(" ? ", "? ")
38
+ string = string.replace(" , ", ", ")
39
+ # double brackets
40
+ string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
41
+ string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
42
+ string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
43
+ string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
44
+ string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
45
+ # miscellaneous
46
+ string = string.replace("= = = =", "====")
47
+ string = string.replace("= = =", "===")
48
+ string = string.replace("= =", "==")
49
+ string = string.replace(" " + chr(176) + " ", chr(176))
50
+ string = string.replace(" \n", "\n")
51
+ string = string.replace("\n ", "\n")
52
+ string = string.replace(" N ", " 1 ")
53
+ string = string.replace(" 's", "'s")
54
+ return string
55
+
56
+
57
+ def ptb_detokenizer(x):
58
+ x = x.replace(" 's", "'s")
59
+ x = x.replace("s ' ", "s' ")
60
+ x = x.replace(" n't", "n't")
61
+ x = x.replace(" \n ", "\n")
62
+ x = x.replace("\\/", "/")
63
+ for _ in range(10):
64
+ x = x.replace(" N ", " 1 ")
65
+ x = x.replace("$ 1", "$1")
66
+ x = x.replace("# 1", "#1")
67
+ x = x.replace("<unk>", "?")
68
+ return x
69
+
70
+
71
+ def lm1b_detokenizer(x):
72
+ x = x.replace('http : / / ', 'http://')
73
+ x = x.replace('https : / / ', 'https://')
74
+ x = re.sub(r' \'(\w+)', r"'\1", x)
75
+ x = re.sub(r' (\w+) \. ', r' \1. ', x)
76
+ x = re.sub(r' (\w+) \.$', r' \1.', x)
77
+ x = x.replace(' ? ', '? ')
78
+ x = re.sub(r' \?$', '?', x)
79
+ x = x.replace(' ! ', '! ')
80
+ x = re.sub(r' \!$', '!', x)
81
+ x = x.replace(' , ', ', ')
82
+ x = x.replace(' : ', ': ')
83
+ x = x.replace(' ; ', '; ')
84
+ x = x.replace(' / ', '/')
85
+ x = re.sub(r'\" ([^\"]+) \"', r'"\1"', x)
86
+ x = re.sub(r'\' ([^\']+) \'', r"'\1'", x)
87
+ x = re.sub(r'\( ([^\(\)]+) \)', r"(\1)", x)
88
+ x = re.sub(r'\[ ([^\[\]]+) \]', r"[\1]", x)
89
+ x = x.replace('$ ', '$')
90
+ x = x.replace('£ ', '£')
91
+ return x
92
+
93
+
94
+ def lambada_detokenizer(text):
95
+ text = text.replace("“", '"')
96
+ text = text.replace("”", '"')
97
+ return '\n'+text.strip()
98
+
99
+
100
+ def scientific_papers_detokenizer(x):
101
+ x = wt_detokenizer(x)
102
+ x = lm1b_detokenizer(x)
103
+ return x
104
+
105
+
106
+ class Text8Tokenizer(transformers.PreTrainedTokenizer):
107
+ def __init__(
108
+ self,
109
+ bos_token='[BOS]',
110
+ eos_token='[EOS]',
111
+ sep_token='[SEP]',
112
+ cls_token='[CLS]',
113
+ pad_token='[PAD]',
114
+ mask_token='[MASK]',
115
+ unk_token='[UNK]',
116
+ **kwargs):
117
+ self.characters = list('abcdefghijklmnopqrstuvwxyz ')
118
+ self._vocab_str_to_int = {
119
+ '[CLS]': 0,
120
+ '[SEP]': 1,
121
+ '[BOS]': 2,
122
+ '[EOS]': 3,
123
+ '[MASK]': 4,
124
+ '[PAD]': 5,
125
+ '[RESERVED]': 6,
126
+ '[UNK]': 7,
127
+ ** {ch: i + 8 for i, ch in enumerate(self.characters)}}
128
+ self._vocab_int_to_str = {
129
+ v: k for k, v in self._vocab_str_to_int.items()}
130
+ super().__init__(
131
+ bos_token=bos_token,
132
+ eos_token=eos_token,
133
+ sep_token=sep_token,
134
+ cls_token=cls_token,
135
+ pad_token=pad_token,
136
+ mask_token=mask_token,
137
+ unk_token=unk_token,
138
+ **kwargs)
139
+
140
+ @property
141
+ def vocab_size(self) -> int:
142
+ return len(self._vocab_str_to_int)
143
+
144
+ def _tokenize(self, text: str, **kwargs) -> typing.List[str]:
145
+ return list(text.lower())
146
+
147
+ def _convert_token_to_id(self, token: str) -> int:
148
+ return self._vocab_str_to_int.get(
149
+ token, self._vocab_str_to_int['[UNK]'])
150
+
151
+ def _convert_id_to_token(self, index: int) -> str:
152
+ return self._vocab_int_to_str[index]
153
+
154
+ def convert_tokens_to_string(self, tokens):
155
+ return ''.join(tokens)
156
+
157
+ def get_vocab(self) -> typing.Dict[str, int]:
158
+ return self._vocab_str_to_int
159
+
160
+
161
+ def get_lambada_test_dataset():
162
+ url = "https://openaipublic.blob.core.windows.net/gpt-2/data/lambada_test.jsonl"
163
+
164
+ def read_jsonl_to_list(url):
165
+ response = requests.get(url, stream=True)
166
+ data_list = []
167
+
168
+ # Process each line in the response content
169
+ for line in response.iter_lines(decode_unicode=True):
170
+ if line:
171
+ data = json.loads(line)
172
+ data_list.append(data)
173
+
174
+ return data_list
175
+
176
+ lambada_data = read_jsonl_to_list(url)
177
+ dataset = datasets.Dataset.from_list(lambada_data)
178
+ return dataset
179
+
180
+ def get_text8_dataset(cache_dir, max_seq_length=256,
181
+ drop_last=True, crop_train=False):
182
+ """Adapted from:
183
+ https://github.com/google-research/google-research/blob/master/d3pm/text/datasets.py#L344
184
+
185
+ Args:
186
+ cache_dir: str, path to cache directory.
187
+ max_seq_length: int, maximum length of sequences.
188
+ (default: 256, as in D3PM codebase.)
189
+ drop_last: bool, whether to drop the last incomplete
190
+ batch. (default: True, as in D3PM codebase.)
191
+ crop_train: bool, whether to subsample contiguous
192
+ subsequences from training example. serves to
193
+ make sure transformer models with absolute position
194
+ embeddings do not have incorrect position-wise
195
+ marginals. (default: False, but necessary to match D3PM AR)
196
+
197
+ Returns:
198
+ dataset: dataset.DatasetDict, with keys 'train',
199
+ 'valid', 'test'.
200
+ """
201
+ url = 'http://mattmahoney.net/dc/text8.zip'
202
+ if not crop_train:
203
+ cache_dir = f'{cache_dir}/text8'
204
+ else:
205
+ cache_dir = f'{cache_dir}/text8-crop-train'
206
+ split_names = ['train', 'validation', 'test']
207
+ if not all([
208
+ utils.fsspec_exists(os.path.join(cache_dir, split))
209
+ for split in split_names
210
+ ]):
211
+ # Check if raw data exists
212
+ raw_cache_dir = os.path.join(cache_dir, 'raw_data')
213
+ if not all([
214
+ utils.fsspec_exists(
215
+ os.path.join(raw_cache_dir, f'text8.{split}.txt'))
216
+ for split in split_names
217
+ ]):
218
+ if not utils.fsspec_exists(
219
+ os.path.join(raw_cache_dir, 'text8.zip')):
220
+ utils.fsspec_mkdirs(raw_cache_dir, exist_ok=True)
221
+ LOGGER.info('Downloading text8 from URL {}.'.format(url))
222
+ with urllib.request.urlopen(url) as in_stream:
223
+ with open(os.path.join(raw_cache_dir, 'text8.zip'), 'wb') as out_file:
224
+ shutil.copyfileobj(in_stream, out_file)
225
+
226
+ with fsspec.open(
227
+ os.path.join(raw_cache_dir, 'text8.zip'),
228
+ 'rb') as f:
229
+ rawdata = zipfile.ZipFile(f).read(
230
+ 'text8').decode('utf-8')
231
+
232
+ # Splits taken from D3PM codebase
233
+ splits = {
234
+ 'train': rawdata[:90000000],
235
+ 'validation': rawdata[90000000: 95000000],
236
+ 'test': rawdata[95000000:],
237
+ }
238
+
239
+ for split, data in splits.items():
240
+ _path = os.path.join(raw_cache_dir,
241
+ f'text8.{split}.txt')
242
+ with fsspec.open(_path, 'w') as f:
243
+ f.write(data)
244
+ else:
245
+ splits = {}
246
+ for split in split_names:
247
+ _path = os.path.join(raw_cache_dir,
248
+ f'text8.{split}.txt')
249
+ with fsspec.open(_path, 'r') as f:
250
+ splits[split] = f.read()
251
+
252
+ # Chunk and save as datasets.DatasetDict
253
+ def chunks(lst, n):
254
+ """Yield successive n-sized chunks from lst."""
255
+ for i in range(0, len(lst), n):
256
+ yield lst[i:i + n]
257
+
258
+ dataset_dict = {}
259
+ for k, v in splits.items():
260
+ if k == 'train' and crop_train == True:
261
+ chunk_size = 2 * max_seq_length
262
+ else:
263
+ chunk_size = max_seq_length
264
+ text = list(chunks(v, chunk_size))
265
+ if drop_last and len(text[-1]) < chunk_size:
266
+ text = text[:-1]
267
+ dataset_dict[k] = datasets.Dataset.from_dict({'text': text})
268
+ dataset = datasets.DatasetDict(dataset_dict)
269
+ dataset.save_to_disk(cache_dir)
270
+ else:
271
+ dataset = datasets.load_from_disk(cache_dir)
272
+
273
+ return dataset
274
+
275
+
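+ # Usage sketch: materialize the text8 DatasetDict with the helper above. The
+ # cache path is a placeholder; the first call downloads and unzips text8
+ # (roughly 100 MB of raw text), later calls reload the saved DatasetDict.
+ def _text8_splits_example(cache_dir='/tmp/text8_cache'):
+     dataset = get_text8_dataset(cache_dir, max_seq_length=256)
+     # Each record is a 256-character chunk, e.g. dataset['train'][0]['text'].
+     return {split: dataset[split].num_rows for split in dataset}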
276
+ def _group_texts(examples, block_size, bos, eos):
277
+ # Concatenate all texts.
278
+ concatenated_examples = list(itertools.chain(* examples['input_ids']))
279
+ total_length = len(concatenated_examples)
280
+ # TODO(yair): look into not dropping the remainder but rather padding it.
281
+ # We drop the small remainder, and if the total_length < block_size - 2
282
+ # we exclude this batch and return an empty dict.
283
+ # We could add padding if the model supported it instead of
284
+ # this drop, you can customize this part to your needs.
285
+ new_block_size = block_size - 2 # [BOS] and [EOS] to be added
286
+ total_length = (total_length // new_block_size) * new_block_size
287
+ # Split by chunks of max_len.
288
+ result = {}
289
+ _values = []
290
+ _attn_masks = []
291
+ for i in range(0, total_length, new_block_size):
292
+ _values.append(
293
+ [bos]
294
+ + concatenated_examples[i : i + new_block_size]
295
+ + [eos])
296
+ _attn_masks.append(torch.ones(block_size))
297
+ result['input_ids'] = _values
298
+ result['attention_mask'] = _attn_masks
299
+ return result
300
+
301
+
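+ # Worked toy example of the wrapping above (illustrative ids; 0/1 stand in
+ # for the real BOS/EOS): the tokenized examples are concatenated and
+ # re-chunked into blocks of exactly `block_size` tokens, each bracketed by
+ # BOS/EOS.
+ def _group_texts_example():
+     examples = {'input_ids': [[10, 11, 12], [13, 14], [15, 16, 17, 18]]}
+     out = _group_texts(examples, block_size=5, bos=0, eos=1)
+     # out['input_ids'] == [[0, 10, 11, 12, 1], [0, 13, 14, 15, 1], [0, 16, 17, 18, 1]]
+     return out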
302
+ def get_dataset(
303
+ dataset_name, tokenizer, wrap, mode, cache_dir,
304
+ block_size=1024, num_proc=len(os.sched_getaffinity(0)), streaming=False):
305
+ if wrap:
306
+ filename = f'{dataset_name}_{mode}_bs{block_size}_wrapped.dat'
307
+ else:
308
+ filename = f'{dataset_name}_{mode}_bs{block_size}_unwrapped.dat'
309
+ _path = os.path.join(cache_dir, filename)
310
+
311
+ if utils.fsspec_exists(_path):
312
+ LOGGER.info(f'Loading data from: {_path}')
313
+ return datasets.load_from_disk(_path).with_format('torch')
314
+ LOGGER.info(f'Generating new data at: {_path}')
315
+
316
+ crop_train = dataset_name == 'text8-crop'
317
+ if mode == 'train' and crop_train:
318
+ # double block size for sub-sampling
319
+ block_size *= 2
320
+
321
+ if dataset_name == 'wikitext103':
322
+ dataset = datasets.load_dataset(
323
+ 'wikitext',
324
+ name='wikitext-103-raw-v1',
325
+ cache_dir=cache_dir)
326
+ elif dataset_name == 'wikitext2':
327
+ dataset = datasets.load_dataset(
328
+ 'wikitext',
329
+ name='wikitext-2-raw-v1',
330
+ cache_dir=cache_dir)
331
+ elif dataset_name == 'ptb':
332
+ dataset = datasets.load_dataset(
333
+ 'ptb_text_only', cache_dir=cache_dir)
334
+ elif dataset_name == 'lambada':
335
+ dataset = get_lambada_test_dataset()
336
+ elif dataset_name == 'text8':
337
+ assert wrap
338
+ dataset = get_text8_dataset(
339
+ cache_dir, max_seq_length=block_size)
340
+ elif dataset_name == 'text8-crop':
341
+ dataset = get_text8_dataset(
342
+ cache_dir, max_seq_length=block_size, crop_train=True)
343
+ elif dataset_name == 'openwebtext-train':
344
+ dataset = datasets.load_dataset(
345
+ 'openwebtext',
346
+ split='train[:-100000]',
347
+ cache_dir=cache_dir,
348
+ streaming=streaming)
349
+ elif dataset_name == 'openwebtext-valid':
350
+ dataset = datasets.load_dataset(
351
+ 'openwebtext',
352
+ split='train[-100000:]',
353
+ cache_dir=cache_dir,
354
+ streaming=streaming)
355
+ elif dataset_name == 'scientific_papers_arxiv':
356
+ dataset = datasets.load_dataset(
357
+ 'scientific_papers', 'arxiv',
358
+ trust_remote_code=True,
359
+ cache_dir=cache_dir,
360
+ streaming=streaming)
361
+ elif dataset_name == 'scientific_papers_pubmed':
362
+ dataset = datasets.load_dataset(
363
+ 'scientific_papers', 'pubmed',
364
+ trust_remote_code=True,
365
+ cache_dir=cache_dir,
366
+ streaming=streaming)
367
+ elif dataset_name == 'ag_news':
368
+ dataset = datasets.load_dataset(
369
+ 'ag_news',
370
+ cache_dir=cache_dir,
371
+ streaming=streaming)
372
+ else:
373
+ dataset = datasets.load_dataset(
374
+ dataset_name,
375
+ cache_dir=cache_dir,
376
+ streaming=streaming)
377
+
378
+ if dataset_name in ['lambada', 'openwebtext-train',
379
+ 'openwebtext-valid']:
380
+ data = dataset
381
+ else:
382
+ data = dataset[mode]
383
+
384
+ if dataset_name.startswith('wikitext'):
385
+ detokenizer = wt_detokenizer
386
+ elif dataset_name == 'ptb':
387
+ detokenizer = ptb_detokenizer
388
+ elif dataset_name == 'lm1b':
389
+ detokenizer = lm1b_detokenizer
390
+ elif dataset_name == 'lambada':
391
+ detokenizer = lambada_detokenizer
392
+ elif dataset_name.startswith('scientific_papers'):
393
+ detokenizer = scientific_papers_detokenizer
394
+ else:
395
+ detokenizer = None
396
+
397
+ def _apply_detokenizer(detokenizer):
398
+ def detok(text):
399
+ for i, t in enumerate(text, 0):
400
+ text[i] = detokenizer(t)
401
+ return text
402
+ return detok
403
+
404
+ EOS = tokenizer.encode(tokenizer.eos_token)[0]
405
+ BOS = tokenizer.encode(tokenizer.bos_token)[0]
406
+
407
+ def preprocess_and_tokenize(example):
408
+ if dataset_name == 'ptb':
409
+ text = example['sentence']
410
+ elif 'scientific_papers' in dataset_name:
411
+ text = example['article']
412
+ else:
413
+ text = example['text']
414
+
415
+ if detokenizer is not None:
416
+ text = _apply_detokenizer(detokenizer)(text)
417
+
418
+ tokenizer.padding_side = 'right'
419
+ tokenizer.truncation_side = 'right'
420
+
421
+ if wrap:
422
+ tokens = tokenizer(text,
423
+ add_special_tokens=False,
424
+ return_attention_mask=False,
425
+ return_token_type_ids=False)
426
+ tokens = {'input_ids':
427
+ [t + [EOS] for t in tokens['input_ids']]}
428
+ # Still missing BOS, but will be added in group_texts
429
+ else:
430
+ tokens = tokenizer(text,
431
+ max_length=block_size,
432
+ padding='max_length',
433
+ truncation=True,
434
+ add_special_tokens=True,
435
+ return_attention_mask=True,
436
+ return_token_type_ids=True)
437
+ return tokens
438
+
439
+ if streaming:
440
+ tokenized_dataset = data.map(
441
+ preprocess_and_tokenize,
442
+ batched=True,
443
+ desc='Tokenizing')
444
+ else:
445
+ tokenized_dataset = data.map(
446
+ preprocess_and_tokenize,
447
+ batched=True,
448
+ num_proc=num_proc,
449
+ load_from_cache_file=True,
450
+ desc='Tokenizing')
451
+ if dataset_name == 'ptb':
452
+ tokenized_dataset = tokenized_dataset.remove_columns(
453
+ 'sentence')
454
+ elif 'scientific_papers' in dataset_name:
455
+ tokenized_dataset = tokenized_dataset.remove_columns([
456
+ 'article', 'abstract', 'section_names'])
457
+ elif dataset_name == 'ag_news':
458
+ tokenized_dataset = tokenized_dataset.remove_columns(
459
+ ['text', 'label'])
460
+ else:
461
+ tokenized_dataset = tokenized_dataset.remove_columns(
462
+ 'text')
463
+
464
+ if not wrap:
465
+ tokenized_dataset.save_to_disk(_path)
466
+ return tokenized_dataset.with_format('torch')
467
+
468
+ group_texts = functools.partial(
469
+ _group_texts, block_size=block_size, bos=BOS, eos=EOS)
470
+ if streaming:
471
+ chunked_dataset = tokenized_dataset.map(
472
+ group_texts,
473
+ batched=True,
474
+ desc='Grouping')
475
+ else:
476
+ chunked_dataset = tokenized_dataset.map(
477
+ group_texts,
478
+ batched=True,
479
+ num_proc=num_proc,
480
+ load_from_cache_file=True,
481
+ desc='Grouping')
482
+ chunked_dataset.save_to_disk(_path)
483
+ chunked_dataset = chunked_dataset.with_format('torch')
484
+ return chunked_dataset
485
+
486
+
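+ # Usage sketch: build a wrapped WikiText-2 train split with the loader above.
+ # The GPT-2 tokenizer and the cache path are illustrative choices, not values
+ # taken from this repo's configs.
+ def _wikitext2_example(cache_dir='/tmp/mdlm_data'):
+     tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
+     return get_dataset('wikitext2', tokenizer, wrap=True, mode='train',
+                        cache_dir=cache_dir, block_size=1024)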
487
+ def get_tokenizer(config):
488
+ if config.data.tokenizer_name_or_path == 'text8':
489
+ tokenizer = Text8Tokenizer()
490
+ elif config.data.tokenizer_name_or_path == 'bert-base-uncased':
491
+ tokenizer = transformers.BertTokenizer.\
492
+ from_pretrained('bert-base-uncased')
493
+ else:
494
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
495
+ config.data.tokenizer_name_or_path)
496
+
497
+ if (isinstance(tokenizer, transformers.GPT2TokenizerFast)
498
+ or isinstance(tokenizer, transformers.GPT2Tokenizer)):
499
+ tokenizer._tokenizer.post_processor = tokenizers.processors.BertProcessing(
500
+ (tokenizer.bos_token, tokenizer.bos_token_id),
501
+ (tokenizer.eos_token, tokenizer.eos_token_id))
502
+
503
+ # For wrapped batches:
504
+ # [BOS] sent1 [EOS] sent2-fragment [EOS]
505
+ # [BOS] sent2-fragment [EOS] sent3 [EOS]
506
+ if tokenizer.bos_token is None:
507
+ if tokenizer.cls_token is None:
508
+ raise AttributeError(
509
+ 'Tokenizer must have a bos_token or '
510
+ f'cls_token: {tokenizer}')
511
+ tokenizer.bos_token = tokenizer.cls_token
512
+ if tokenizer.eos_token is None:
513
+ if tokenizer.sep_token is None:
514
+ raise AttributeError(
515
+ 'Tokenizer must have an eos_token '
516
+ f'or sep_token: {tokenizer}')
517
+ tokenizer.eos_token = tokenizer.sep_token
518
+ if tokenizer.pad_token is None:
519
+ tokenizer.add_special_tokens({'pad_token': '[PAD]'})
520
+
521
+ return tokenizer
522
+
523
+
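+ # Usage sketch: the fallback above lets a BERT-style tokenizer (which has no
+ # native BOS/EOS) reuse [CLS]/[SEP] as sequence delimiters. The config object
+ # here is a stand-in exposing only the field that get_tokenizer reads.
+ def _get_tokenizer_example():
+     import types
+     cfg = types.SimpleNamespace(
+         data=types.SimpleNamespace(tokenizer_name_or_path='bert-base-uncased'))
+     tokenizer = get_tokenizer(cfg)
+     return tokenizer.bos_token, tokenizer.eos_token  # ('[CLS]', '[SEP]')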
524
+ def get_dataloaders(config, tokenizer, skip_train=False,
525
+ skip_valid=False, valid_seed=None):
526
+ num_gpus = torch.cuda.device_count()
527
+ assert (config.loader.global_batch_size
528
+ == (config.loader.batch_size
529
+ * config.trainer.num_nodes
530
+ * num_gpus
531
+ * config.trainer.accumulate_grad_batches))
532
+ if config.loader.global_batch_size % (
533
+ num_gpus * config.trainer.accumulate_grad_batches) != 0:
534
+ raise ValueError(
535
+ f'Train Batch Size {config.loader.global_batch_size} '
536
+ f'not divisible by {num_gpus} gpus with accumulation '
537
+ f'{config.trainer.accumulate_grad_batches}.')
538
+ if config.loader.eval_global_batch_size % num_gpus != 0:
539
+ raise ValueError(
540
+ f'Eval Batch Size {config.loader.eval_global_batch_size} '
541
+ f'not divisible by {num_gpus}.')
542
+ if skip_train:
543
+ train_set = None
544
+ else:
545
+ train_set = get_dataset(
546
+ config.data.train,
547
+ tokenizer,
548
+ mode='train',
549
+ wrap=config.data.wrap,
550
+ #cache_dir=config.data.cache_dir,
551
+ block_size=config.model.length)
552
+
553
+ if config.data.valid in ['text8', 'lm1b', 'ag_news']:
554
+ validation_split = 'test'
555
+ else:
556
+ validation_split = 'validation'
557
+ if skip_valid:
558
+ valid_set = None
559
+ else:
560
+ valid_set = get_dataset(
561
+ config.data.valid,
562
+ tokenizer,
563
+ wrap=config.data.wrap,
564
+ mode=validation_split,
565
+ #cache_dir=config.data.cache_dir,
566
+ block_size=config.model.length,
567
+ streaming=False)
568
+
569
+ if skip_train:
570
+ train_loader = None
571
+ else:
572
+ train_loader = torch.utils.data.DataLoader(
573
+ train_set,
574
+ batch_size=config.loader.batch_size,
575
+ num_workers=config.loader.num_workers,
576
+ pin_memory=config.loader.pin_memory,
577
+ shuffle=not config.data.streaming,
578
+ persistent_workers=True)
579
+ train_loader.tokenizer = tokenizer
580
+ if skip_valid:
581
+ valid_loader = None
582
+ else:
583
+ if valid_seed is None:
584
+ shuffle_valid = False
585
+ generator = None
586
+ else:
587
+ shuffle_valid = True
588
+ generator = torch.Generator().manual_seed(valid_seed)
589
+ valid_loader = torch.utils.data.DataLoader(
590
+ valid_set,
591
+ batch_size=config.loader.eval_batch_size,
592
+ num_workers=config.loader.num_workers,
593
+ pin_memory=config.loader.pin_memory,
594
+ shuffle=shuffle_valid,
595
+ generator=generator)
596
+ # Will be used in generative perplexity calculation
597
+ valid_loader.tokenizer = tokenizer
598
+
599
+ return train_loader, valid_loader
600
+
601
+
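+ # Worked example of the consistency check above: with a global batch size of
+ # 8, one node, two GPUs and no gradient accumulation, each device must see
+ # 8 / (2 * 1 * 1) = 4 samples per step. The helper below restates that
+ # relation; its argument names are illustrative.
+ def _per_device_batch_size(global_batch_size, num_gpus, num_nodes=1,
+                            accumulate_grad_batches=1):
+     return global_batch_size // (num_gpus * num_nodes * accumulate_grad_batches)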
602
+ # Samplers adapted from: https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/fault_tolerant_sampler.py
603
+
604
+
605
+ class RandomFaultTolerantSampler(torch.utils.data.RandomSampler):
606
+
607
+ def __init__(self, *args, generator=None, **kwargs):
608
+ # TD [2022-07-17]: We don't force the seed to be zero. We generate random seed,
609
+ # which should be reproducible if pl.seed_everything was called beforehand.
610
+ # This means that changing the seed of the experiment will also change the
611
+ # sampling order.
612
+ if generator is None:
613
+ seed = int(torch.empty((), dtype=torch.int64).random_().item())
614
+ generator = torch.Generator().manual_seed(seed)
615
+ kwargs.pop('shuffle', None)
616
+ super().__init__(*args, generator=generator, **kwargs)
617
+ self.counter = 0
618
+ self.restarting = False
619
+
620
+ def state_dict(self):
621
+ return {'random_state': self.state,
622
+ 'counter': self.counter}
623
+
624
+ def load_state_dict(self, state_dict):
625
+ self.generator.set_state(state_dict.get('random_state'))
626
+ self.counter = state_dict['counter']
627
+ # self.start_counter = self.counter
628
+ self.restarting = True
629
+
630
+ # TD [2022-08-28] Setting the len will cause PL to think there are only a few batches left per
631
+ # epoch, and subsequent epoch will have very few batches.
632
+
633
+ def __iter__(self) -> typing.Iterator[int]:
634
+ n = len(self.data_source)
635
+
636
+ self.state = self.generator.get_state()
637
+ indices = torch.randperm(n, generator=self.generator).tolist()
638
+
639
+ if not self.restarting:
640
+ self.counter = 0
641
+ else:
642
+ indices = indices[self.counter:]
643
+ self.restarting = False
644
+
645
+ for index in indices:
646
+ self.counter += 1
647
+ yield index
648
+
649
+ self.counter = 0
650
+
651
+
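+ # Usage sketch: checkpoint the sampler mid-epoch and resume. After
+ # load_state_dict, __iter__ draws a permutation from the restored generator
+ # state and skips the first `counter` indices, so the epoch continues rather
+ # than restarting from scratch.
+ def _fault_tolerant_resume_example():
+     sampler = RandomFaultTolerantSampler(range(10))
+     it = iter(sampler)
+     served = [next(it) for _ in range(3)]
+     state = sampler.state_dict()            # what a checkpoint would store
+     resumed = RandomFaultTolerantSampler(range(10))
+     resumed.load_state_dict(state)
+     remaining = list(resumed)               # 7 indices left to finish the epoch
+     return served, remaining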
652
+ class FaultTolerantDistributedSampler(torch.utils.data.DistributedSampler):
653
+
654
+ def __init__(self, *args, **kwargs):
655
+ super().__init__(*args, **kwargs)
656
+ self.counter = 0
657
+ self.restarting = False
658
+
659
+ def state_dict(self):
660
+ return {'epoch': self.epoch, 'counter': self.counter}
661
+
662
+ def load_state_dict(self, state_dict):
663
+ self.epoch = state_dict['epoch']
664
+ self.counter = state_dict['counter']
665
+ self.restarting = True
666
+
667
+ # TD [2022-08-28] Setting the len will cause PL to think there are only a few batches left per
668
+ # epoch, and subsequent epoch will have very few batches.
669
+ def __iter__(self):
670
+ if self.shuffle:
671
+ # deterministically shuffle based on epoch and seed
672
+ g = torch.Generator()
673
+ g.manual_seed(self.seed + self.epoch)
674
+ indices = torch.randperm(len(self.dataset), generator=g).tolist() # type: ignore[arg-type]
675
+ else:
676
+ indices = list(range(len(self.dataset))) # type: ignore[arg-type]
677
+
678
+ if not self.drop_last:
679
+ # add extra samples to make it evenly divisible
680
+ padding_size = self.total_size - len(indices)
681
+ if padding_size <= len(indices):
682
+ indices += indices[:padding_size]
683
+ else:
684
+ indices += (indices * math.ceil(
685
+ padding_size / len(indices)))[:padding_size]
686
+ else:
687
+ # remove tail of data to make it evenly divisible.
688
+ indices = indices[:self.total_size]
689
+ assert len(indices) == self.total_size
690
+
691
+ # subsample
692
+ indices = indices[self.rank:self.total_size:self.num_replicas]
693
+ assert len(indices) == self.num_samples
694
+
695
+ if not self.restarting:
696
+ self.counter = 0
697
+ else:
698
+ indices = indices[self.counter:]
699
+ self.restarting = False
700
+
701
+ for index in indices:
702
+ self.counter += 1
703
+ yield index
704
+
705
+ self.counter = 0
706
+
707
+ from torch.utils.data import Dataset, DataLoader
708
+ import lightning.pytorch as pl
709
+ from functools import partial
710
+ import sys
711
+
712
+ class CustomDataset(torch.utils.data.Dataset):
713
+ def __init__(self, dataset, indices):
714
+ self.dataset = dataset
715
+ self.indices = indices
716
+
717
+ def __len__(self):
718
+ return len(self.indices)
719
+
720
+ def __getitem__(self, idx):
721
+ actual_idx = int(self.indices[idx])
722
+ item = self.dataset[actual_idx]
723
+ return item
724
+
725
+ def membrane_collate_fn(batch, tokenizer):
726
+ """Custom data collator that masks TM/soluble residues for focused training"""
727
+ MAX_LENGTH = 1024
728
+ sequences = [item['Sequence'].upper() for item in batch]
729
+
730
+ masks = []
731
+ for item in batch:
732
+ if item["Label"] == 0:
733
+ mask = [1 if i.isupper() else 0 for i in item["Sequence"]]
734
+ else:
735
+ mask = [0 if i.isupper() else 1 for i in item["Sequence"]]
736
+ mask = [1] + mask
737
+ if len(mask) > MAX_LENGTH: # Truncate
738
+ mask = mask[:MAX_LENGTH]
739
+ elif len(mask) < MAX_LENGTH: # Pad
740
+ mask += [1] * (MAX_LENGTH - len(mask))
741
+
742
+ masks.append(torch.as_tensor(mask))
743
+
744
+ mask_t = torch.stack(masks, dim=0)
745
+ tokens = tokenizer(sequences, return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LENGTH)
746
+
747
+ return {
748
+ 'input_ids': tokens['input_ids'],
749
+ 'attention_mask': tokens['attention_mask'],
750
+ 'mask': mask_t
751
+ }
752
+
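+ # Worked toy example of the mask built above (the sequence is synthetic, not
+ # a real protein): letter case marks the annotated region, `Label` selects
+ # which region keeps a 1, and the leading 1 covers the BOS/CLS position added
+ # by the tokenizer. The collator then pads the mask with 1s up to MAX_LENGTH.
+ def _focus_mask_example():
+     item = {'Sequence': 'mmAAAAmm', 'Label': 0}
+     residue_mask = [1 if ch.isupper() else 0 for ch in item['Sequence']]
+     return [1] + residue_mask  # [1, 0, 0, 1, 1, 1, 1, 0, 0]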
753
+ def wrap_collate_fn(batch, tokenizer):
754
+ """Standard data collator that wraps sequences over padding them"""
755
+ # Define sequence size
756
+ chunk_size = 1024
757
+ eos_placeholder = "k"
758
+ eos = "<eos>"
759
+
760
+ # Wrap sequences by collecting and splitting them into chunks
761
+ # From MDLM paper: insert <eos> at start/end of chunks and in between sequences
762
+ sequences = eos_placeholder.join([item['Sequence'].upper() for item in batch])
763
+ sequences = eos_placeholder + sequences + eos_placeholder
764
+ wrapped_sequences = [sequences[i:i+chunk_size] for i in range(0, len(sequences), chunk_size)]
765
+ for idx, seq in enumerate(wrapped_sequences):
766
+ wrapped_sequences[idx] = seq.replace(eos_placeholder, eos)
767
+
768
+ # Tokenize for input ids and attention masks
769
+ tokens = tokenizer(wrapped_sequences, return_tensors='pt', padding=True)
770
+
771
+ return {
772
+ "input_ids": tokens['input_ids'],
773
+ "attention_mask": tokens['attention_mask']
774
+ }
775
+
776
+
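+ # Worked toy example of the wrapping above with a tiny chunk size: sequences
+ # are joined into one stream delimited by a lowercase placeholder (which
+ # cannot collide with the upper-cased residues) and cut into fixed-size
+ # chunks before the placeholder is expanded to <eos>.
+ def _wrap_example(chunk_size=8):
+     batch = [{'Sequence': 'MKT'}, {'Sequence': 'GGA'}]
+     stream = 'k' + 'k'.join(item['Sequence'].upper() for item in batch) + 'k'
+     chunks = [stream[i:i + chunk_size] for i in range(0, len(stream), chunk_size)]
+     # -> ['<eos>MKT<eos>GGA', '<eos>']
+     return [c.replace('k', '<eos>') for c in chunks]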
777
+
778
+ def collate_fn(batch, tokenizer):
779
+ """Standard data collator that truncates/pad sequences based on max_length"""
780
+ sequences = [item['Sequence'].upper() for item in batch]
781
+ max_len = max([len(seq) for seq in sequences])
782
+ #labels = torch.tensor([item['labels'] for item in batch], dtype=torch.float32)
783
+
784
+ tokens = tokenizer(sequences, return_tensors='pt', padding='max_length', truncation=True, max_length=1024)
785
+
786
+ #attention_masks = torch.ones(tokens.size()[:2], dtype=torch.bool)
787
+
788
+ return {
789
+ 'input_ids': tokens['input_ids'],
790
+ 'attention_mask': tokens['attention_mask']
791
+ }
792
+
793
+ class CustomDataModule(pl.LightningDataModule):
794
+ def __init__(self, train_dataset, val_dataset, test_dataset, tokenizer, batch_size: int=8, collate_fn=collate_fn):
795
+ super().__init__()
796
+ self.train_dataset = train_dataset
797
+ self.val_dataset = val_dataset
798
+ self.test_dataset = test_dataset
799
+ self.batch_size = batch_size
800
+ self.tokenizer = tokenizer
801
+ self.collate_fn = collate_fn
802
+
803
+ def train_dataloader(self):
804
+ return DataLoader(self.train_dataset, batch_size=self.batch_size,
805
+ collate_fn=partial(self.collate_fn, tokenizer=self.tokenizer),
806
+ num_workers=8, pin_memory=True)
807
+
808
+
809
+ def val_dataloader(self):
810
+ return DataLoader(self.val_dataset, batch_size=self.batch_size,
811
+ collate_fn=partial(self.collate_fn, tokenizer=self.tokenizer),
812
+ num_workers=8, pin_memory=True)
813
+
814
+ def test_dataloader(self):
815
+ return DataLoader(self.test_dataset, batch_size=self.batch_size,
816
+ collate_fn=partial(self.collate_fn, tokenizer=self.tokenizer),
817
+ num_workers=8, pin_memory=True)
818
+
819
+
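+ # Usage sketch: wiring the pieces above together for CSV splits that expose
+ # 'Sequence' and 'Label' columns (an assumption about the data files). The
+ # ESM-2 checkpoint name is an illustrative choice.
+ def _build_membrane_datamodule(train_csv, val_csv, test_csv, batch_size=8):
+     import pandas as pd
+     tokenizer = transformers.AutoTokenizer.from_pretrained(
+         'facebook/esm2_t30_150M_UR50D')
+     train, val, test = (pd.read_csv(path).to_dict('records')
+                         for path in (train_csv, val_csv, test_csv))
+     return CustomDataModule(train, val, test, tokenizer,
+                             batch_size=batch_size,
+                             collate_fn=membrane_collate_fn)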
utils.py ADDED
@@ -0,0 +1,230 @@
1
+ """Console logger utilities.
2
+
3
+ Copied from https://github.com/HazyResearch/transformers/blob/master/src/utils/utils.py
4
+ Copied from https://docs.python.org/3/howto/logging-cookbook.html#using-a-context-manager-for-selective-logging
5
+ """
6
+
7
+ import logging
8
+ import math
9
+
10
+ import fsspec
11
+ import lightning
12
+ import torch
13
+ from timm.scheduler import CosineLRScheduler
14
+
15
+
16
+ def fsspec_exists(filename):
17
+ """Check if a file exists using fsspec."""
18
+ fs, _ = fsspec.core.url_to_fs(filename)
19
+ return fs.exists(filename)
20
+
21
+
22
+ def fsspec_listdir(dirname):
23
+ """Listdir in manner compatible with fsspec."""
24
+ fs, _ = fsspec.core.url_to_fs(dirname)
25
+ return fs.ls(dirname)
26
+
27
+
28
+ def fsspec_mkdirs(dirname, exist_ok=True):
29
+ """Mkdirs in manner compatible with fsspec."""
30
+ fs, _ = fsspec.core.url_to_fs(dirname)
31
+ fs.makedirs(dirname, exist_ok=exist_ok)
32
+
33
+
34
+ def print_nans(tensor, name):
35
+ if torch.isnan(tensor).any():
36
+ print(name, tensor)
37
+
38
+
39
+ class CosineDecayWarmupLRScheduler(
40
+ CosineLRScheduler,
41
+ torch.optim.lr_scheduler._LRScheduler):
42
+ """Wrap timm.scheduler.CosineLRScheduler
43
+ Enables calling scheduler.step() without passing in epoch.
44
+ Supports resuming as well.
45
+ Adapted from:
46
+ https://github.com/HazyResearch/hyena-dna/blob/main/src/utils/optim/schedulers.py
47
+ """
48
+
49
+ def __init__(self, *args, **kwargs):
50
+ super().__init__(*args, **kwargs)
51
+ self._last_epoch = -1
52
+ self.step(epoch=0)
53
+
54
+ def step(self, epoch=None):
55
+ if epoch is None:
56
+ self._last_epoch += 1
57
+ else:
58
+ self._last_epoch = epoch
59
+ # We call either step or step_update, depending on
60
+ # whether we're using the scheduler every epoch or every
61
+ # step.
62
+ # Otherwise, lightning will always call step (i.e.,
63
+ # meant for each epoch), and if we set scheduler
64
+ # interval to "step", then the learning rate update will
65
+ # be wrong.
66
+ if self.t_in_epochs:
67
+ super().step(epoch=self._last_epoch)
68
+ else:
69
+ super().step_update(num_updates=self._last_epoch)
70
+
71
+
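+ # Usage sketch: a per-step cosine schedule with linear warmup, returned the
+ # way a LightningModule's configure_optimizers would with interval='step'.
+ # All hyperparameter values below are placeholders.
+ def _cosine_warmup_example(model):
+     optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
+     scheduler = CosineDecayWarmupLRScheduler(
+         optimizer, t_initial=10_000, lr_min=1e-6,
+         warmup_t=500, warmup_lr_init=1e-6, t_in_epochs=False)
+     return {'optimizer': optimizer,
+             'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}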
72
+ class LoggingContext:
73
+ """Context manager for selective logging."""
74
+ def __init__(self, logger, level=None, handler=None, close=True):
75
+ self.logger = logger
76
+ self.level = level
77
+ self.handler = handler
78
+ self.close = close
79
+
80
+ def __enter__(self):
81
+ if self.level is not None:
82
+ self.old_level = self.logger.level
83
+ self.logger.setLevel(self.level)
84
+ if self.handler:
85
+ self.logger.addHandler(self.handler)
86
+
87
+ def __exit__(self, et, ev, tb):
88
+ if self.level is not None:
89
+ self.logger.setLevel(self.old_level)
90
+ if self.handler:
91
+ self.logger.removeHandler(self.handler)
92
+ if self.handler and self.close:
93
+ self.handler.close()
94
+
95
+
96
+ def get_logger(name=__name__, level=logging.INFO) -> logging.Logger:
97
+ """Initializes multi-GPU-friendly python logger."""
98
+
99
+ logger = logging.getLogger(name)
100
+ logger.setLevel(level)
101
+
102
+ # this ensures all logging levels get marked with the rank zero decorator
103
+ # otherwise logs would get multiplied for each GPU process in multi-GPU setup
104
+ for level in ('debug', 'info', 'warning', 'error',
105
+ 'exception', 'fatal', 'critical'):
106
+ setattr(logger,
107
+ level,
108
+ lightning.pytorch.utilities.rank_zero_only(
109
+ getattr(logger, level)))
110
+
111
+ return logger
112
+
113
+
114
+ class Sampler:
115
+ def __init__(self, shape):
116
+ self.shape = shape
117
+
118
+ def _sampling_noise(self):
119
+ pass
120
+
121
+ def _hard_sample(self, logits):
122
+ pass
123
+
124
+ def _soft_sample(self, logits):
125
+ return 0
126
+
127
+ def sample(self, logits):
128
+ noise = self._sampling_noise()
129
+ noise = noise[: logits.shape[0], :]
130
+ logits = logits + noise.to(
131
+ dtype=logits.dtype, device=logits.device)
132
+ hard_sample = self._hard_sample(logits)
133
+ soft_sample = self._soft_sample(logits)
134
+ return soft_sample + (hard_sample - soft_sample).detach()
135
+
136
+
137
+ class TopKSampler(Sampler):
138
+ def __init__(self, k, shape, gamma_tau=1.0):
139
+ super().__init__(shape)
140
+ self.k = k
141
+ self.gamma_tau = gamma_tau
142
+ self.num_betas = 10
143
+ self.sampler = torch.distributions.gamma.Gamma(
144
+ 1 / k * torch.ones(self.num_betas, * self.shape), 1.0)
145
+
146
+ def _sampling_noise(self):
147
+ noise = self.sampler.sample()
148
+ beta = self.k / torch.arange(1, self.num_betas + 1, 1,
149
+ dtype=torch.float32)
150
+ beta = beta[:, None, None]
151
+ assert beta.ndim == noise.ndim
152
+ s = noise / beta
153
+ s = torch.sum(s, axis=0)
154
+ s = s - math.log(10.0)
155
+ s = self.gamma_tau * (s / self.k)
156
+ return s
157
+
158
+ def _hard_sample(self, logits):
159
+ assert logits.ndim == 2
160
+ thresholds, _ = torch.sort(logits, dim=-1)
161
+ thresholds = thresholds[:, - self.k][:, None]
162
+ return (logits >= thresholds).type(logits.dtype)
163
+
164
+ def _soft_sample(self, logits):
165
+ soft_top_k = logits - torch.mean(logits, dim=-1,
166
+ keepdim=True)
167
+ return soft_top_k / torch.norm(soft_top_k, dim=-1,
168
+ keepdim=True)
169
+
170
+
171
+ class DeterministicTopK(TopKSampler):
172
+ def __init__(self, k):
173
+ super().__init__(k, shape=(1, 1))
174
+
175
+ def _sampling_noise(self):
176
+ return 0
177
+
178
+ def discreize(self, x):
179
+ hard_sample = self._hard_sample(x)
180
+ soft_sample = self._soft_sample(x)
181
+ return soft_sample + (hard_sample - soft_sample).detach()
182
+
183
+ class GumbelSampler(Sampler):
184
+
185
+ def __init__(self, shape, temperature=1.0):
186
+ super().__init__(shape)
187
+ self.temperature = temperature
188
+
189
+ def _sampling_noise(self):
190
+ return - (1e-10 - (
191
+ torch.rand(* self.shape) + 1e-10).log()).log()
192
+
193
+ def _hard_sample(self, logits):
194
+ assert logits.ndim == 2
195
+ indices = torch.argmax(logits, dim=-1)
196
+ zeros = logits * 0
197
+ ones = torch.ones_like(logits[:, :, :1])
198
+ return torch.scatter(zeros, -1, indices[:, :, None],
199
+ ones)
200
+
201
+ def _soft_sample(self, logits):
202
+ return torch.nn.functional.softmax(
203
+ logits / self.temperature, dim=-1)
204
+
205
+
206
+ class BinarySampler(GumbelSampler):
207
+
208
+ def sample(self, probs):
209
+ # TODO(subhamsahoo): use the temperature parameter.
210
+ pos_noise = self._sampling_noise().to(
211
+ dtype=probs.dtype, device=probs.device)
212
+ neg_noise = self._sampling_noise().to(
213
+ dtype=probs.dtype, device=probs.device)
214
+ del_noise_exp = (neg_noise - pos_noise).exp()
215
+ hard_sample = (probs * (1 + del_noise_exp)
216
+ > 1).to(probs.dtype)
217
+ soft_sample = probs / (probs + (1 - probs) * del_noise_exp)
218
+ return soft_sample + (hard_sample - soft_sample).detach()
219
+
220
+
221
+ class GaussianSampler:
222
+ def __init__(self):
223
+ self.softplus = torch.nn.Softplus()
224
+
225
+ def sample(self, x):
226
+ assert x.ndim == 2
227
+ n = x.shape[-1] // 2
228
+ mu = x[:, :n]
229
+ sigma = self.softplus(x[:, n:]).sqrt()
230
+ return mu + sigma * torch.randn_like(mu)
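+ 
+ 
+ # Usage sketch: GaussianSampler applies the reparameterization trick. The
+ # network output is split into means and pre-softplus variance parameters,
+ # and the returned sample stays differentiable with respect to both halves.
+ def _gaussian_sampler_example():
+     sampler = GaussianSampler()
+     x = torch.randn(4, 16, requires_grad=True)   # 8 means + 8 variance params per row
+     z = sampler.sample(x)                        # shape: (4, 8)
+     return z.shape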