sgoel30 committed
Commit bce7ea6 · verified · 1 parent: faaee28

Delete scripts

Files changed (4):
  1. scripts/diffusion.py +0 -295
  2. scripts/generate.py +0 -131
  3. scripts/test.py +0 -17
  4. scripts/train.py +0 -50
scripts/diffusion.py DELETED
@@ -1,295 +0,0 @@
- import itertools
- import math
- import torch
- import torch.nn as nn
- import numpy as np
- import pytorch_lightning as L
- import torchmetrics
- from dataclasses import dataclass
- from esm_utils import load_esm2_model
- from transformers import AutoModel, AutoTokenizer
- import dit, ema
- import sys
- import config
- import wandb
- import noise_schedule  # Assuming this is part of the MDLM repository
-
- wandb_key = "2b76a2fa2c1cdfddc5f443602c17b011fefb0a8f"
- wandb.login(key=wandb_key)
- wandb.init(project=config.Wandb.PROJECT, group=config.Wandb.GROUP)
-
- LOG2 = math.log(2)
-
- # Goal is to build an MDLM head on the BERT-style ESM model
- # Wrap the ESM model to obtain embeddings and ignore sigma to work with MDLM codebase
- class WrapESM(nn.Module):
-     def __init__(self, esm_model_path):
-         super(WrapESM, self).__init__()
-         self.esm_tokenizer, self.esm_model, _ = load_esm2_model(esm_model_path)
-
-         ### Only fine-tune the last 3 layers of ESM
-         # Count number of encoder layers
-         model_layers = len(self.esm_model.esm.encoder.layer)
-
-         # Disable parameter updates for all layers
-         for param in self.esm_model.parameters():
-             param.requires_grad = False
-
-         # Now that all parameters are disabled, only enable updates for the last 3 layers
-         for i, layer in enumerate(self.esm_model.esm.encoder.layer):
-             if i >= model_layers - config.ESM_LAYERS:
-                 for module in layer.attention.self.key.modules():
-                     for param in module.parameters():
-                         param.requires_grad = True
-                 for module in layer.attention.self.query.modules():
-                     for param in module.parameters():
-                         param.requires_grad = True
-                 for module in layer.attention.self.value.modules():
-                     for param in module.parameters():
-                         param.requires_grad = True
-
-     def forward(self, latents, sigma):
-         return latents
-
- @dataclass
- class Loss:
-     loss: torch.FloatTensor
-     nlls: torch.FloatTensor
-     token_mask: torch.FloatTensor
-
- class NLL(torchmetrics.MeanMetric):
-     pass
-
- class BPD(NLL):
-     def compute(self) -> torch.Tensor:
-         """Computes the bits per dimension.
-         Returns:
-             bpd
-         """
-         return self.mean_value / self.weight / LOG2
-
- class Perplexity(NLL):
-     def compute(self) -> torch.Tensor:
-         """Computes the Perplexity.
-         Returns:
-             Perplexity
-         """
-         return torch.exp(self.mean_value / self.weight)
-
-
- # Based on MDLM repo
- class Diffusion(L.LightningModule):
-     def __init__(self, config, latent_dim, tokenizer):
-         super().__init__()
-         self.config = config
-         self.latent_dim = latent_dim
-         self.tokenizer = tokenizer
-
-         self.softplus = torch.nn.Softplus()
-         metrics = torchmetrics.MetricCollection({
-             'nll': NLL(),
-             'bpd': BPD(),
-             'ppl': Perplexity(),
-         })
-         metrics.set_dtype(torch.float64)
-         self.train_metrics = metrics.clone(prefix='train/')
-         self.valid_metrics = metrics.clone(prefix='val/')
-         self.test_metrics = metrics.clone(prefix='test/')
-
-         self.T = self.config.T
-         self.lr = self.config.Optim.LR
-         self.backbone = WrapESM(self.config.MODEL_NAME)
-         self.noise = noise_schedule.get_noise(self.config, dtype=self.dtype)
-         self.time_conditioning = self.config.TIME_CONDITIONING
-         self.subs_masking = self.config.SUBS_MASKING
-         self.mask_index = self.tokenizer.mask_token_id
-         self.antithetic_sampling = self.config.Training.ANTITHETIC_SAMPLING
-         self.sampling_eps = self.config.Training.SAMPLING_EPS
-         self.neg_infinity = -1000000.0
-
-
-     ############ FORWARD DIFFUSION #########
-     def subs_parameterization(self, logits, noised_latents):
-         print(logits.size())  # [bsz x bsz x seq_len]
-         logits = logits.float()
-         logits[:, :, self.mask_index] += self.neg_infinity
-
-         # Normalize the logits such that x.exp() is a probability distribution over vocab_size.
-         logits = logits - torch.logsumexp(logits, dim=-1, keepdim=True)
-
-         unmasked_indices = (noised_latents != self.mask_index)
-         logits[unmasked_indices] = self.neg_infinity
-         logits[~unmasked_indices] = 0
-
-         return logits
-
-         # # -inf probability of selecting a masked token
-         # unmasked_indices = (noised_latents != self.mask_index)
-         # logits[unmasked_indices] = self.neg_infinity
-
-         # # Carry over unmasked tokens
-         # bsz, seq_len, input_dim = logits.shape
-         # for batch_idx in range(bsz):
-         #     for residue in range(seq_len):
-         #         logits[batch_idx, residue, noised_latents[batch_idx, residue]] = 0
-
-         # return logits
-
-     def forward(self, latents, sigma):
-         latents = latents.long()
-         logits = self.backbone(latents, sigma)
-         optimized_logits = self.subs_parameterization(logits, latents)
-         return optimized_logits
-
-     def q_xt(self, latents, move_chance):
-         """
-         Computes the noisy sample xt.
-         Args:
-             x: int torch.Tensor with shape (batch_size, diffusion_model_input_length), input.
-             move_chance: float torch.Tensor with shape (batch_size, 1).
-         """
-         latents = torch.mean(latents, dim=2)  # [bsz x seq_len x 1280] --> [bsz x seq_len] as per markdown
-         move_indices = torch.rand(*latents.shape, device=latents.device) < move_chance
-         noised_latents = torch.where(move_indices, self.mask_index, latents)
-         return noised_latents
-
-     def sample_timestep(self, n, device):
-         _eps_t = torch.rand(n, device=device)
-         if self.antithetic_sampling:
-             offset = torch.arange(n, device=device) / n
-             _eps_t = (_eps_t / n + offset) % 1
-         t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
-         # if self.importance_sampling:
-         #     return self.noise.importance_sampling_transformation(t)
-         return t
-
-     def forward_diffusion(self, x0):
-         """Forward diffusion process, adds noise to the latents."""
-
-         t = self.sample_timestep(x0.shape[0], x0.device)
-         sigma, dsigma = self.noise(t)
-         unet_conditioning = sigma[:, None]
-         move_chance = 1 - torch.exp(-sigma[:, None, None])
-
-         xt = self.q_xt(x0, move_chance)
-         model_output = self.forward(xt, unet_conditioning)
-         print(f'model out: {model_output}')
-         print(f'model out dim: {model_output.size()}')  # [bsz x bsz x seq_len]
-
-         # SUBS parameterization, continuous time.
-         idx = torch.mean(x0, dim=2).long()[:, :, None]
-         print(f'idx: {idx}')
-         print(f'idx dim: {idx.size()}')  # [bsz x seq_len x 1]
-
-         log_p_theta = torch.gather(input=model_output, dim=-1, index=idx).squeeze(-1)
-         scale = (dsigma / torch.expm1(sigma))[:, None]
-         return -log_p_theta * scale
-
-
-     ######### LOSS CALCULATIONS #########
-     def compute_loss(self, latents, attention_mask):
-         """Average of MLM losses to stabilize training"""
-         loss = self.forward_diffusion(latents)
-
-         nlls = loss * attention_mask
-         count = attention_mask.sum()
-         batch_nll = nlls.sum()
-         token_nll = batch_nll / count
-
-         return Loss(loss=token_nll, nlls=nlls, token_mask=attention_mask)
-
-
-     ######### TRAINING #########
-     def training_step(self, batch):
-         latents, attention_mask = batch
-         loss = self.compute_loss(latents, attention_mask)
-         wandb.log({"train_loss": loss.loss.item()})
-         return loss.loss
-
-     def configure_optimizers(self):
-         optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
-         return optimizer
-
-     def validation_step(self, batch):
-         latents, attention_mask = batch
-         loss = self.compute_loss(latents, attention_mask)
-         wandb.log({"val_loss": loss.loss.item()})
-         return loss.loss
-
-
-     ######### GENERATION #########
-     def sample_prior(self, *batch_dims):
-         return self.mask_index * torch.ones(*batch_dims, dtype=torch.int64)
-
-     def sample_categorical(categorical_probs):
-         gumbel_norm = (1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log())
-         return (categorical_probs / gumbel_norm).argmax(dim=-1)
-
-     def ddpm_caching_update(self, x, t, dt, p_x0=None):
-         assert self.config.noise.type == 'loglinear'
-         sigma_t, _ = self.noise(t)
-         if t.ndim > 1:
-             t = t.squeeze(-1)
-         assert t.ndim == 1
-         move_chance_t = t[:, None, None]
-         move_chance_s = (t - dt)[:, None, None]
-         assert move_chance_t.ndim == 3, move_chance_t.shape
-         if p_x0 is None:
-             p_x0 = self.forward(x, sigma_t).exp()
-
-         assert move_chance_t.ndim == p_x0.ndim
-         q_xs = p_x0 * (move_chance_t - move_chance_s)
-         q_xs[:, :, self.mask_index] = move_chance_s[:, :, 0]
-         _x = self.sample_categorical(q_xs)
-
-         copy_flag = (x != self.mask_index).to(x.dtype)
-         return p_x0, copy_flag * x + (1 - copy_flag) * _x
-
-
-     @torch.no_grad()
-     def sample_subs_guidance(self, n_samples, stride_length, num_strides, dt=0.001):
-         ones = torch.ones(n_samples, dtype=self.dtype, device=self.device)
-         num_steps = int(1 / dt)
-         sampling_steps = 0
-         intermediate_tokens = []
-         target = None
-
-         for _ in range(num_strides + 1):
-             p_x0_cache = None
-             x = self._sample_prior(n_samples, self.config.model.length).to(self.device)
-
-             if target is not None:
-                 x[:, :-stride_length] = target
-
-             for i in range(num_steps + 1):
-                 p_x0_cache, x_next = self.ddpm_caching_update(x=x, t=(1 - i * dt) * ones, dt=dt, p_x0=p_x0_cache)
-                 if (not torch.allclose(x_next, x) or self.time_conditioning):
-                     p_x0_cache = None
-                     sampling_steps += 1
-                 x = x_next
-             x = self.forward(x, 0 * ones).argmax(dim=-1)
-             intermediate_tokens.append(x[:, :stride_length].cpu().numpy())
-             target = x[:, stride_length:]
-
-         intermediate_tokens.append(target.cpu().numpy())
-         intermediate_text_samples = []
-         sequence_lengths = ((np.concatenate(intermediate_tokens, axis=1)[:, 1:]
-                              == self.tokenizer.eos_token_id).cumsum(-1) == 0).sum(-1)
-
-         for i in range(2, len(intermediate_tokens) + 1):
-             intermediate_text_samples.append(self.tokenizer.decode(np.concatenate(intermediate_tokens[:i], axis=1)))
-
-         return (sampling_steps, intermediate_text_samples,
-                 sequence_lengths)
-
-     def restore_model_and_semi_ar_sample(self, stride_length, num_strides, dt=0.001):
-         """Generate samples from the model."""
-         # Lightning auto-casting is not working in this method for some reason
-         self.backbone.eval()
-         self.noise.eval()
-
-         (sampling_steps, samples, sequence_lengths) = self.sample_subs_guidance(n_samples=self.config.Loader.BATCH_SIZE, stride_length=stride_length, num_strides=num_strides, dt=dt)
-
-         self.backbone.train()
-         self.noise.train()
-         return sampling_steps, samples, sequence_lengths
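As context for the objective that forward_diffusion and compute_loss implement above: a minimal standalone sketch of MDLM-style masked-diffusion training, assuming a log-linear noise schedule sigma(t) = -log(1 - (1 - eps)·t), so the per-token masking probability is 1 - exp(-sigma). The helper name, shapes, and averaging over masked positions are illustrative assumptions, not code from this repository.

import torch

# Illustrative sketch only (assumed log-linear schedule); not part of the deleted scripts.
def toy_masked_diffusion_loss(model, x0, t, mask_index, eps=1e-3):
    """model: (xt, sigma) -> [bsz, seq_len, vocab] logits; x0: [bsz, seq_len] token ids; t: [bsz] in (0, 1]."""
    sigma = -torch.log1p(-(1 - eps) * t)              # noise level sigma(t)
    dsigma = (1 - eps) / (1 - (1 - eps) * t)          # d sigma / d t
    move_chance = 1 - torch.exp(-sigma)               # per-token masking probability

    mask = torch.rand(x0.shape, device=x0.device) < move_chance[:, None]
    xt = torch.where(mask, torch.full_like(x0, mask_index), x0)   # noisy tokens x_t

    logits = model(xt, sigma)                         # denoiser predicts x_0 from x_t
    log_p_x0 = logits.log_softmax(dim=-1).gather(-1, x0.unsqueeze(-1)).squeeze(-1)
    weight = (dsigma / torch.expm1(sigma))[:, None]   # continuous-time SUBS weight
    return -(weight * log_p_x0 * mask).sum() / mask.sum().clamp(min=1)

Unmasked positions contribute no loss in this sketch, which mirrors what the carry-over step in subs_parameterization enforces by pinning their log-probabilities to zero.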
 
scripts/generate.py DELETED
@@ -1,131 +0,0 @@
- import torch
- import numpy as np
- from transformers import AutoTokenizer, AutoModel
- from models.diffusion import Diffusion
- from configs.config import Config
- from utils.esm_utils import load_esm2_model, get_latents
-
- def mask_sequence(sequence, mask_char='X'):
-     """Masks parts of the sequence based on the mask_char."""
-     mask_indices = [i for i, char in enumerate(sequence) if char == mask_char]
-     masked_sequence = sequence.replace(mask_char, '[MASK]')
-     return masked_sequence, mask_indices
-
- def generate_filled_sequence(model, tokenizer, esm_model, masked_sequence, mask_indices):
-     """Generates the filled sequence for the masked regions."""
-     inputs = tokenizer(masked_sequence, return_tensors="pt")
-     with torch.no_grad():
-         outputs = esm_model(**inputs)
-         latents = outputs.last_hidden_state.squeeze(0)
-
-     sigma = torch.rand(1, device=latents.device)
-     noisy_latents = model.forward(latents, sigma)
-     denoised_latents = model.reverse_diffusion(noisy_latents, sigma)
-
-     filled_sequence = list(masked_sequence)
-     for idx in mask_indices:
-         token_id = torch.argmax(denoised_latents[idx]).item()
-         filled_sequence[idx] = tokenizer.decode([token_id])
-
-     return ''.join(filled_sequence)
-
- def generate_scaffold_sequence(model, tokenizer, esm_model, peptides, final_length):
-     """Generates a scaffold sequence to connect multiple peptides."""
-     total_peptide_length = sum(len(peptide) for peptide in peptides)
-     scaffold_length = final_length - total_peptide_length
-     if scaffold_length <= 0:
-         raise ValueError("Final length must be greater than the combined length of the peptides.")
-
-     scaffold = "[MASK]" * scaffold_length
-     masked_sequence = "".join(peptides[:1] + [scaffold] + peptides[1:])
-
-     inputs = tokenizer(masked_sequence, return_tensors="pt")
-     with torch.no_grad():
-         outputs = esm_model(**inputs)
-         latents = outputs.last_hidden_state.squeeze(0)
-
-     sigma = torch.rand(1, device=latents.device)
-     noisy_latents = model.forward(latents, sigma)
-     denoised_latents = model.reverse_diffusion(noisy_latents, sigma)
-
-     filled_sequence = list(masked_sequence)
-     scaffold_start = len(peptides[0])
-     scaffold_end = scaffold_start + scaffold_length
-     for idx in range(scaffold_start, scaffold_end):
-         token_id = torch.argmax(denoised_latents[idx]).item()
-         filled_sequence[idx] = tokenizer.decode([token_id])
-
-     return ''.join(filled_sequence)
-
- def generate_de_novo_sequence(model, tokenizer, esm_model, sequence_length):
-     """Generates a de novo protein sequence of the specified length."""
-     scaffold = "[MASK]" * sequence_length
-     masked_sequence = scaffold
-
-     inputs = tokenizer(masked_sequence, return_tensors="pt")
-     with torch.no_grad():
-         outputs = esm_model(**inputs)
-         latents = outputs.last_hidden_state.squeeze(0)
-
-     sigma = torch.rand(1, device=latents.device)
-     noisy_latents = model.forward(latents, sigma)
-     denoised_latents = model.reverse_diffusion(noisy_latents, sigma)
-
-     filled_sequence = list(masked_sequence)
-     for idx in range(sequence_length):
-         token_id = torch.argmax(denoised_latents[idx]).item()
-         filled_sequence[idx] = tokenizer.decode([token_id])
-
-     return ''.join(filled_sequence)
-
- if __name__ == "__main__":
-     import argparse
-
-     # Argument parsing
-     parser = argparse.ArgumentParser(description="Generate protein sequences using latent diffusion model.")
-     subparsers = parser.add_subparsers(dest="mode")
-
-     # Subparser for the first strategy (multiple peptides to scaffold)
-     parser_scaffold = subparsers.add_parser("scaffold", help="Generate scaffold to connect multiple peptides.")
-     parser_scaffold.add_argument("peptides", nargs='+', help="Peptides to connect.")
-     parser_scaffold.add_argument("final_length", type=int, help="Final length of the protein sequence.")
-
-     # Subparser for the second strategy (fill in regions)
-     parser_fill = subparsers.add_parser("fill", help="Fill in specified regions in a given protein sequence.")
-     parser_fill.add_argument("sequence", help="Protein sequence with regions to fill specified by 'X'.")
-
-     # Subparser for the third strategy (de novo generation)
-     parser_de_novo = subparsers.add_parser("de_novo", help="Generate a de novo protein sequence.")
-     parser_de_novo.add_argument("sequence_length", type=int, help="Length of the de novo generated protein sequence.")
-
-     args = parser.parse_args()
-
-     # Load configurations
-     config = Config()
-
-     # Load models
-     tokenizer, esm_model = load_esm2_model(config.model_name)
-     diffusion_model = Diffusion.load_from_checkpoint(config.training["save_dir"] + "example.ckpt", config=config, latent_dim=config.latent_dim)
-     diffusion_model.eval()
-
-     if args.mode == "scaffold":
-         peptides = args.peptides
-         final_length = args.final_length
-         filled_sequence = generate_scaffold_sequence(diffusion_model, tokenizer, esm_model, peptides, final_length)
-         print(f"Peptides: {' '.join(peptides)}")
-         print(f"Final Length: {final_length}")
-         print(f"Generated Protein: {filled_sequence}")
-
-     elif args.mode == "fill":
-         sequence = args.sequence
-         masked_sequence, mask_indices = mask_sequence(sequence)
-         filled_sequence = generate_filled_sequence(diffusion_model, tokenizer, esm_model, masked_sequence, mask_indices)
-         print(f"Original Sequence: {sequence}")
-         print(f"Masked Sequence: {masked_sequence}")
-         print(f"Filled Sequence: {filled_sequence}")
-
-     elif args.mode == "de_novo":
-         sequence_length = args.sequence_length
-         filled_sequence = generate_de_novo_sequence(diffusion_model, tokenizer, esm_model, sequence_length)
-         print(f"De Novo Sequence Length: {sequence_length}")
-         print(f"Generated Protein: {filled_sequence}")
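One detail of mask_sequence above that matters when indexing back into the output: mask_indices records the positions of the mask character in the original sequence, while each 'X' is replaced by the multi-character string '[MASK]' in the returned masked_sequence. A toy illustration, with a hypothetical input and assuming mask_sequence is in scope as defined above:

# Toy illustration of mask_sequence; the input value is hypothetical.
sequence = "ACXXDE"
masked_sequence, mask_indices = mask_sequence(sequence)
print(masked_sequence)   # AC[MASK][MASK]DE
print(mask_indices)      # [2, 3] -- indices into the original sequence, not the masked string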
 
scripts/test.py DELETED
@@ -1,17 +0,0 @@
- import pytorch_lightning as L
- from configs.config import Config
- from utils.data_loader import get_dataloaders
- from models.diffusion import Diffusion
-
- # Get dataloaders
- _, _, test_loader = get_dataloaders(Config)
-
- # Initialize model
- checkpoint_path = Config.training["save_dir"] + "example.ckpt"
- latent_diffusion_model = Diffusion.load_from_checkpoint(checkpoint_path, config=Config, latent_dim=Config.latent_dim)
-
- # Initialize trainer
- trainer = L.Trainer(gpus=Config.training["gpus"], precision=Config.training["precision"])
-
- # Test the model
- trainer.test(latent_diffusion_model, test_loader)
 
scripts/train.py DELETED
@@ -1,50 +0,0 @@
- import pytorch_lightning as L
- from pytorch_lightning.strategies import DDPStrategy
- from pytorch_lightning.callbacks import ModelCheckpoint
- import config
- from data_loader import get_dataloaders
- from esm_utils import load_esm2_model
- from diffusion import Diffusion
- import wandb
- import sys
-
- # Get dataloaders
- train_loader, val_loader, _ = get_dataloaders(config)
-
- # Initialize ESM tokenizer and model
- tokenizer, _, _ = load_esm2_model(config.MODEL_NAME)
-
- # Initialize diffusion model
- latent_diffusion_model = Diffusion(config, latent_dim=config.LATENT_DIM, tokenizer=tokenizer)
- print(latent_diffusion_model)
- sys.stdout.flush()
-
- # Define checkpoints to save best model by minimum validation loss
- checkpoint_callback = ModelCheckpoint(
-     monitor='val_loss',
-     save_top_k=1,
-     mode='min',
-     dirpath="/workspace/a03-sgoel/MDpLM/",
-     filename="best_model_epoch{epoch:02d}"
- )
-
- # Initialize trainer
- trainer = L.Trainer(
-     max_epochs=config.Training.NUM_EPOCHS,
-     precision=config.Training.PRECISION,
-     devices=1,
-     accelerator='gpu',
-     strategy=DDPStrategy(find_unused_parameters=False),
-     accumulate_grad_batches=config.Training.ACCUMULATE_GRAD_BATCHES,
-     default_root_dir=config.Training.SAVE_DIR,
-     callbacks=[checkpoint_callback]
- )
-
- print(trainer)
- print("Training model...")
- sys.stdout.flush()
-
- # Train the model
- trainer.fit(latent_diffusion_model, train_loader, val_loader)
-
- wandb.finish()
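The train.py and diffusion.py scripts above rely on a shared config module with nested namespaces (config.Wandb, config.Optim, config.Loader, config.Training) plus a handful of top-level constants. A hypothetical sketch of the interface they appear to expect; the attribute names come from the code above, while every value is a placeholder:

# Hypothetical reconstruction of the config interface referenced by the deleted
# scripts; names are taken from the code above, all values are placeholders.
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"   # placeholder ESM-2 checkpoint
LATENT_DIM = 1280                             # placeholder embedding width
ESM_LAYERS = 3                                # ESM layers left trainable (per the comment in WrapESM)
T = 0                                         # placeholder
TIME_CONDITIONING = False                     # placeholder
SUBS_MASKING = False                          # placeholder

class Wandb:
    PROJECT = "protein-mdlm"                  # placeholder
    GROUP = "baseline"                        # placeholder

class Optim:
    LR = 3e-4                                 # placeholder

class Loader:
    BATCH_SIZE = 8                            # placeholder

class Training:
    NUM_EPOCHS = 10                           # placeholder
    PRECISION = "bf16-mixed"                  # placeholder
    ACCUMULATE_GRAD_BATCHES = 1               # placeholder
    SAVE_DIR = "./checkpoints/"               # placeholder
    ANTITHETIC_SAMPLING = True                # placeholder
    SAMPLING_EPS = 1e-3                       # placeholder

class noise:                                  # accessed as config.noise.type in ddpm_caching_update
    type = "loglinear"

class model:                                  # accessed as config.model.length in sample_subs_guidance
    length = 512                              # placeholder

Note that generate.py and test.py instead read a configs.config.Config class with lowercase, dictionary-style fields (config.model_name, Config.training["save_dir"], ["gpus"], ["precision"]), so the two configuration styles in the deleted scripts are not fully consistent with each other.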