Upload 139 files
This view is limited to 50 files because it contains too many changes. See raw diff.
- binder_generator_train.py +241 -0
- finetune.py +385 -0
- muppit/.gitignore +7 -0
- muppit/LICENSE +201 -0
- muppit/README.md +250 -0
- muppit/__pycache__/classifier.cpython-310.pyc +0 -0
- muppit/__pycache__/dataloader.cpython-310.pyc +0 -0
- muppit/__pycache__/diffusion.cpython-310.pyc +0 -0
- muppit/__pycache__/noise_schedule.cpython-310.pyc +0 -0
- muppit/__pycache__/utils.cpython-310.pyc +0 -0
- muppit/classifier.py +490 -0
- muppit/configs/callbacks/checkpoint_every_n_steps.yaml +8 -0
- muppit/configs/callbacks/checkpoint_monitor.yaml +10 -0
- muppit/configs/callbacks/learning_rate_monitor.yaml +3 -0
- muppit/configs/classifier_model/dimamba-classifier.yaml +14 -0
- muppit/configs/classifier_model/hyenadna-classifier.yaml +4 -0
- muppit/configs/classifier_model/small-classifier.yaml +11 -0
- muppit/configs/classifier_model/tiny-classifier.yaml +11 -0
- muppit/configs/classifier_model/tiny-dimamba-classifier.yaml +14 -0
- muppit/configs/config.yaml +104 -0
- muppit/configs/data/amazon_polarity.yaml +10 -0
- muppit/configs/data/cifar10.yaml +11 -0
- muppit/configs/data/lm1b.yaml +8 -0
- muppit/configs/data/peptide.yaml +8 -0
- muppit/configs/data/protein.yaml +8 -0
- muppit/configs/data/qm9.yaml +11 -0
- muppit/configs/data/ten_species.yaml +11 -0
- muppit/configs/data/text8.yaml +9 -0
- muppit/configs/guidance/cbg.yaml +5 -0
- muppit/configs/guidance/cfg.yaml +3 -0
- muppit/configs/guidance/fudge.yaml +5 -0
- muppit/configs/guidance/nos.yaml +6 -0
- muppit/configs/guidance/pplm.yaml +6 -0
- muppit/configs/lr_scheduler/constant_warmup.yaml +2 -0
- muppit/configs/lr_scheduler/cosine_decay_warmup.yaml +7 -0
- muppit/configs/model/dimamba.yaml +12 -0
- muppit/configs/model/fudge_predictor.yaml +4 -0
- muppit/configs/model/hf.yaml +2 -0
- muppit/configs/model/medium.yaml +10 -0
- muppit/configs/model/small.yaml +11 -0
- muppit/configs/model/tiny.yaml +10 -0
- muppit/configs/model/unet.yaml +19 -0
- muppit/configs/model/unet_campbell.yaml +19 -0
- muppit/configs/noise/ar.yaml +2 -0
- muppit/configs/noise/linear.yaml +3 -0
- muppit/configs/noise/loglinear.yaml +3 -0
- muppit/configs/noise/polynomial.yaml +5 -0
- muppit/configs/strategy/ddp.yaml +2 -0
- muppit/configs/strategy/fsdp.yaml +3 -0
- muppit/custom_datasets/__init__.py +2 -0
binder_generator_train.py
ADDED
@@ -0,0 +1,241 @@
import torch
from torch.utils.data import DataLoader, Subset
from torch.optim import AdamW
import torch.nn.functional as F
import torch.nn as nn
from datasets import load_from_disk
import esm
import numpy as np
import math
import os
from transformers import AutoTokenizer
from torch.optim.lr_scheduler import CosineAnnealingLR
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
import gc
import pdb

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

##################### Hyper-parameters #############################################
max_epochs = 30
batch_size = 4
lr = 1e-4
num_layers = 4
num_heads = 4
accumulation_steps = 4
checkpoint_path = '/home/tc415/muPPIt_embedding/checkpoints/generator_0'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(f'''
max_epochs = {max_epochs}
batch_size = {batch_size}
lr = {lr}
num_layers = {num_layers}
num_heads = {num_heads}
accumulation_steps = {accumulation_steps}
checkpoint_path = {checkpoint_path}
''')
####################################################################################

os.makedirs(checkpoint_path, exist_ok=True)

train_dataset = load_from_disk('/home/tc415/muPPIt_embedding/dataset/train/ppiref_generator')
val_dataset = load_from_disk('/home/tc415/muPPIt_embedding/dataset/val/ppiref_generator')
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
print(len(train_dataset), len(val_dataset))


def collate_fn(batch):
    # Unpack the batch, dropping the BOS/EOS tokens added by the tokenizer
    binders = []
    targets = []

    for b in batch:
        binder = torch.tensor(b['binder_input_ids']['input_ids'][1:-1])
        target = torch.tensor(b['target_input_ids']['input_ids'][1:-1])

        if binder.dim() == 0 or binder.numel() == 0 or target.dim() == 0 or target.numel() == 0:
            continue
        binders.append(binder)  # shape: 1*L1 -> L1
        targets.append(target)  # shape: 1*L2 -> L2

    # Collate the tensors using torch's pad_sequence
    try:
        binder_input_ids = torch.nn.utils.rnn.pad_sequence(binders, batch_first=True, padding_value=tokenizer.pad_token_id)
        target_input_ids = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=tokenizer.pad_token_id)
    except Exception:
        pdb.set_trace()

    # Return the collated batch
    return {
        'binder_input_ids': binder_input_ids.long(),
        'target_input_ids': target_input_ids.long(),
    }


def RoPE(x, seq_dim=0):
    """
    Applies Rotary Positional Encoding to the input embeddings.
    :param x: Input tensor (seq_len, batch_size, embed_dim)
    :param seq_dim: The sequence dimension, usually 0 (first dimension in (seq_len, batch_size, embed_dim))
    :return: Tensor with RoPE applied (seq_len, batch_size, embed_dim)
    """
    seq_len = x.shape[seq_dim]
    d_model = x.shape[-1]

    # Create the positions and the sine-cosine rotation factors
    theta = torch.arange(0, d_model, 2, dtype=torch.float32) / d_model
    theta = 10000 ** (-theta)  # scaling factor for RoPE
    seq_idx = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)

    # Compute sine and cosine embeddings for each position
    sin_emb = torch.sin(seq_idx * theta)
    cos_emb = torch.cos(seq_idx * theta)

    sin_emb = sin_emb.unsqueeze(1)  # [seq_len, 1, embed_dim//2]
    cos_emb = cos_emb.unsqueeze(1)  # [seq_len, 1, embed_dim//2]

    x1, x2 = x[..., ::2], x[..., 1::2]  # Split embedding into even and odd indices

    cos_emb = cos_emb.to(x1.device)
    sin_emb = sin_emb.to(x1.device)

    # Apply the rotary transformation
    x_rotated = torch.cat([x1 * cos_emb - x2 * sin_emb, x1 * sin_emb + x2 * cos_emb], dim=-1)
    return x_rotated


class BinderGenerator(nn.Module):
    def __init__(self, vocab_size=24, embed_dim=1280, num_heads=4, num_layers=4, lr=1e-4):
        super(BinderGenerator, self).__init__()
        self.esm, self.alphabet = esm.pretrained.esm2_t33_650M_UR50D()
        for param in self.esm.parameters():
            param.requires_grad = False

        self.transformer = nn.Transformer(d_model=embed_dim, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)

        self.criterion = nn.CrossEntropyLoss(ignore_index=self.alphabet.padding_idx)
        self.vocab_size = vocab_size
        self.learning_rate = lr

    def forward(self, binder_tokens, target_tokens):
        with torch.no_grad():
            # Frozen ESM-2 embeddings, zeroed at padding positions
            binder_pad_mask = (binder_tokens != self.alphabet.padding_idx).int()
            binder_embed = self.esm(binder_tokens, repr_layers=[33])["representations"][33] * binder_pad_mask.unsqueeze(-1)

            target_pad_mask = (target_tokens != self.alphabet.padding_idx).int()
            target_embed = self.esm(target_tokens, repr_layers=[33])["representations"][33] * target_pad_mask.unsqueeze(-1)

        binder_embed = binder_embed.transpose(0, 1)
        target_embed = target_embed.transpose(0, 1)

        binder_embed = RoPE(binder_embed)  # [src_len, batch_size, embed_dim]
        target_embed = RoPE(target_embed)  # [tgt_len, batch_size, embed_dim]

        # Causal mask so each target position only attends to earlier target tokens
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(target_embed.size(0)).to(target_embed.device)
        output = self.transformer(binder_embed, target_embed, tgt_mask=tgt_mask)  # [tgt_len, batch_size, embed_dim]
        return self.fc_out(output).transpose(0, 1)  # [batch_size, tgt_len, vocab_size]

    def compute_loss(self, binder_tokens, target_tokens):
        output = self.forward(binder_tokens, target_tokens)
        # Shift by one position: predict token t+1 from positions <= t
        loss = self.criterion(output[:, :-1, :].reshape(-1, self.vocab_size), target_tokens[:, 1:].reshape(-1))
        return loss

    def step(self, batch, compute_acc=False):
        binder_tokens = batch['binder_input_ids'].to(device)
        target_tokens = batch['target_input_ids'].to(device)

        if not compute_acc:
            return self.compute_loss(binder_tokens, target_tokens)

        output = self.forward(binder_tokens, target_tokens)
        loss = self.criterion(output[:, :-1, :].reshape(-1, self.vocab_size), target_tokens[:, 1:].reshape(-1))

        # Token-level accuracy over non-padding positions, with the same shift as the loss
        preds = torch.argmax(output[:, :-1, :], dim=-1)
        mask = target_tokens[:, 1:] != self.alphabet.padding_idx
        correct = ((preds == target_tokens[:, 1:]) & mask).sum().item()
        accuracy = correct / mask.sum().item()
        return loss, accuracy


def train(model, optimizer, scheduler, cosine_scheduler, train_dataset, val_dataset, batch_size, max_epochs=10, accumulation_steps=4):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False, num_workers=4)

    max_val_acc = 0
    for epoch in range(max_epochs):
        print(f"Epoch {epoch + 1}/{max_epochs}")

        scaler = GradScaler()

        model.train()
        running_loss = 0.0
        optimizer.zero_grad()

        for batch_idx, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
            batch = {k: v.cuda(non_blocking=True) for k, v in batch.items()}  # Transfer batch to GPU

            with autocast():
                loss = model.step(batch)

            scaler.scale(loss).backward()

            if (batch_idx + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

                # Linear warmup first, then hand over to cosine annealing
                if scheduler.last_epoch < warmup_steps:
                    scheduler.step()
                else:
                    cosine_scheduler.step()

            running_loss += loss.item()

        print(f"Epoch {epoch}: Training Loss = {running_loss / len(train_loader)}")

        gc.collect()
        torch.cuda.empty_cache()

        model.eval()
        val_loss = 0.0
        val_acc = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, total=len(val_loader)):
                batch = {k: v.cuda(non_blocking=True) for k, v in batch.items()}
                val_loss_batch, val_acc_batch = model.step(batch, compute_acc=True)
                val_loss += val_loss_batch.item()
                val_acc += val_acc_batch

        # Average the per-batch accuracies over the number of validation batches
        val_acc /= len(val_loader)
        print(f"Epoch {epoch}: Val Loss = {val_loss / len(val_loader)}\tVal Acc = {val_acc}")

        if val_acc > max_val_acc:
            max_val_acc = val_acc
            torch.save(model.state_dict(), os.path.join(checkpoint_path, f"epoch={epoch}_acc={round(val_acc, 2)}"))


model = BinderGenerator(vocab_size=24, embed_dim=1280, num_heads=num_heads, num_layers=num_layers, lr=lr).to(device)
optimizer = AdamW(model.parameters(), lr=model.learning_rate, betas=(0.9, 0.95), weight_decay=1e-5)

total_steps = len(train_dataset) // (batch_size * accumulation_steps) * max_epochs
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
cosine_scheduler = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps, eta_min=0.1 * lr)

train(model, optimizer, scheduler, cosine_scheduler, train_dataset, val_dataset, batch_size=batch_size, max_epochs=max_epochs, accumulation_steps=accumulation_steps)
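A quick way to sanity-check the `RoPE` helper above is to confirm that it preserves shapes and per-position norms (rotations are norm-preserving). This is an illustrative standalone snippet, not part of the uploaded file:

```python
# Sanity check for the RoPE helper defined above (illustrative only).
import torch

x = torch.randn(7, 2, 8)  # [seq_len, batch_size, embed_dim]
y = RoPE(x)

assert y.shape == x.shape
# Each even/odd channel pair is rotated by a position-dependent angle,
# so the per-position L2 norm should be unchanged.
print(torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-5))  # True
```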
finetune.py
ADDED
@@ -0,0 +1,385 @@
import pdb
from pytorch_lightning.strategies import DDPStrategy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, DistributedSampler, BatchSampler, Sampler
from datasets import load_from_disk
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, \
    Timer, TQDMProgressBar, LearningRateMonitor, StochasticWeightAveraging, GradientAccumulationScheduler
from pytorch_lightning.loggers import WandbLogger
from torch.optim.lr_scheduler import _LRScheduler
from argparse import ArgumentParser
import os
import uuid
import esm
import numpy as np
import torch.distributed as dist
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
# from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
from torch.optim import Adam, AdamW
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
import torch_geometric.nn as pyg_nn
import gc
import math

# os.environ["TORCH_CPP_LOG_LEVEL"] = "INFO"
# os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# VHSE8 physicochemical descriptors (8 values per standard amino acid)
vhse8_values = {
    'A': [0.15, -1.11, -1.35, -0.92, 0.02, -0.91, 0.36, -0.48],
    'R': [-1.47, 1.45, 1.24, 1.27, 1.55, 1.47, 1.30, 0.83],
    'N': [-0.99, 0.00, 0.69, -0.37, -0.55, 0.85, 0.73, -0.80],
    'D': [-1.15, 0.67, -0.41, -0.01, -2.68, 1.31, 0.03, 0.56],
    'C': [0.18, -1.67, -0.21, 0.00, 1.20, -1.61, -0.19, -0.41],
    'Q': [-0.96, 0.12, 0.18, 0.16, 0.09, 0.42, -0.20, -0.41],
    'E': [-1.18, 0.40, 0.10, 0.36, -2.16, -0.17, 0.91, 0.36],
    'G': [-0.20, -1.53, -2.63, 2.28, -0.53, -1.18, -1.34, 1.10],
    'H': [-0.43, -0.25, 0.37, 0.19, 0.51, 1.28, 0.93, 0.65],
    'I': [1.27, 0.14, 0.30, -1.80, 0.30, -1.61, -0.16, -0.13],
    'L': [1.36, 0.07, 0.26, -0.80, 0.22, -1.37, 0.08, -0.62],
    'K': [-1.17, 0.70, 0.80, 1.64, 0.67, 1.63, 0.13, -0.01],
    'M': [1.01, -0.53, 0.43, 0.00, 0.23, 0.10, -0.86, -0.68],
    'F': [1.52, 0.61, 0.95, -0.16, 0.25, 0.28, -1.33, -0.65],
    'P': [0.22, -0.17, -0.50, -0.05, 0.01, -1.34, 0.19, 3.56],
    'S': [-0.67, -0.86, -1.07, -0.41, -0.32, 0.27, -0.64, 0.11],
    'T': [-0.34, -0.51, -0.55, -1.06, 0.01, -0.01, -0.79, 0.39],
    'W': [1.50, 2.06, 1.79, 0.75, 0.75, 0.13, -1.06, -0.85],
    'Y': [0.61, 1.60, 1.17, 0.73, 0.53, 0.25, -0.96, -0.52],
    'V': [0.76, -0.92, 0.17, -1.91, 0.22, -1.40, -0.24, -0.03],
}

# Map amino acids to their ESM-2 vocabulary indices
aa_to_idx = {'A': 5, 'R': 10, 'N': 17, 'D': 13, 'C': 23, 'Q': 16, 'E': 9, 'G': 6, 'H': 21, 'I': 12, 'L': 4, 'K': 15, 'M': 20, 'F': 18, 'P': 14, 'S': 8, 'T': 11, 'W': 22, 'Y': 19, 'V': 7}

# Lookup table indexed by token id; special tokens keep all-zero descriptors
vhse8_tensor = torch.zeros(24, 8)
for aa, values in vhse8_values.items():
    aa_index = aa_to_idx[aa]
    vhse8_tensor[aa_index] = torch.tensor(values)
vhse8_tensor.requires_grad = False


def collate_fn(batch):
    # Unpack the batch, dropping the BOS/EOS tokens added by the tokenizer
    binders = []
    mutants = []
    wildtypes = []
    affs = []
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

    for b in batch:
        binder = torch.tensor(b['binder_input_ids']['input_ids'][1:-1])
        mutant = torch.tensor(b['mutant_input_ids']['input_ids'][1:-1])
        wildtype = torch.tensor(b['wildtype_input_ids']['input_ids'][1:-1])

        if binder.dim() == 0 or binder.numel() == 0 or mutant.dim() == 0 or mutant.numel() == 0 or wildtype.dim() == 0 or wildtype.numel() == 0:
            continue
        binders.append(binder)      # shape: 1*L1 -> L1
        mutants.append(mutant)      # shape: 1*L2 -> L2
        wildtypes.append(wildtype)  # shape: 1*L3 -> L3

        affs.append(b['aff'])

    # Collate the tensors using torch's pad_sequence
    try:
        binder_input_ids = torch.nn.utils.rnn.pad_sequence(binders, batch_first=True, padding_value=tokenizer.pad_token_id)
        mutant_input_ids = torch.nn.utils.rnn.pad_sequence(mutants, batch_first=True, padding_value=tokenizer.pad_token_id)
        wildtype_input_ids = torch.nn.utils.rnn.pad_sequence(wildtypes, batch_first=True, padding_value=tokenizer.pad_token_id)
    except Exception:
        pdb.set_trace()

    affs = torch.tensor(affs)
    # Return the collated batch
    return {
        'binder_input_ids': binder_input_ids.int(),
        'mutant_input_ids': mutant_input_ids.int(),
        'wildtype_input_ids': wildtype_input_ids.int(),
        'aff': affs
    }


class CustomDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, tokenizer, batch_size: int = 128):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        print(len(train_dataset))
        print(len(val_dataset))

    def train_dataloader(self):
        # batch_sampler = LengthAwareDistributedSampler(self.train_dataset, 'mutant_tokens', self.batch_size)
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn,
                          num_workers=8, pin_memory=True)

    def val_dataloader(self):
        # batch_sampler = LengthAwareDistributedSampler(self.val_dataset, 'mutant_tokens', self.batch_size)
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=collate_fn, num_workers=8,
                          pin_memory=True)

    def setup(self, stage=None):
        if stage == 'test' or stage is None:
            pass


class CosineAnnealingWithWarmup(_LRScheduler):
    def __init__(self, optimizer, warmup_steps, total_steps, base_lr, max_lr, min_lr, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.min_lr = min_lr
        super(CosineAnnealingWithWarmup, self).__init__(optimizer, last_epoch)
        print(f"SELF BASE LRS = {self.base_lrs}")

    def get_lr(self):
        if self.last_epoch < self.warmup_steps:
            # Linear warmup phase from base_lr to max_lr
            return [self.base_lr + (self.max_lr - self.base_lr) * (self.last_epoch / self.warmup_steps) for _ in self.base_lrs]

        # Cosine annealing phase from max_lr to min_lr
        progress = (self.last_epoch - self.warmup_steps) / (self.total_steps - self.warmup_steps)
        cosine_decay = 0.5 * (1 + np.cos(np.pi * progress))
        decayed_lr = self.min_lr + (self.max_lr - self.min_lr) * cosine_decay

        return [decayed_lr for _ in self.base_lrs]


class muPPIt(pl.LightningModule):
    def __init__(self, d_node, num_heads, dropout, margin, lr):
        super(muPPIt, self).__init__()

        self.esm, self.alphabet = esm.pretrained.esm2_t33_650M_UR50D()
        for param in self.esm.parameters():
            param.requires_grad = False

        self.attention = nn.MultiheadAttention(embed_dim=d_node, num_heads=num_heads)
        self.layer_norm = nn.LayerNorm(d_node)

        self.map = nn.Sequential(
            nn.Linear(d_node, d_node // 2),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(d_node // 2, 1)
        )

        self.margin = margin
        self.learning_rate = lr

        for layer in self.map:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_uniform_(layer.weight, a=0, mode='fan_in', nonlinearity='leaky_relu')
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)

    def forward(self, binder_tokens, wt_tokens, mut_tokens):
        device = binder_tokens.device
        global vhse8_tensor
        vhse8_tensor = vhse8_tensor.to(device)

        with torch.no_grad():
            # Frozen ESM-2 embeddings, zeroed at padding positions, with VHSE8 descriptors appended
            binder_pad_mask = (binder_tokens != self.alphabet.padding_idx).int()
            binder_embed = self.esm(binder_tokens, repr_layers=[33])["representations"][33] * binder_pad_mask.unsqueeze(-1)
            binder_vhse8 = vhse8_tensor[binder_tokens]
            binder_embed = torch.concat([binder_embed, binder_vhse8], dim=-1)

            mut_pad_mask = (mut_tokens != self.alphabet.padding_idx).int()
            mut_embed = self.esm(mut_tokens, repr_layers=[33])["representations"][33] * mut_pad_mask.unsqueeze(-1)
            mut_vhse8 = vhse8_tensor[mut_tokens]
            mut_embed = torch.concat([mut_embed, mut_vhse8], dim=-1)

            wt_pad_mask = (wt_tokens != self.alphabet.padding_idx).int()
            wt_embed = self.esm(wt_tokens, repr_layers=[33])["representations"][33] * wt_pad_mask.unsqueeze(-1)
            wt_vhse8 = vhse8_tensor[wt_tokens]
            wt_embed = torch.concat([wt_embed, wt_vhse8], dim=-1)

        binder_wt = torch.concat([binder_embed, wt_embed], dim=1)
        binder_mut = torch.concat([binder_embed, mut_embed], dim=1)

        # Self-attention over each binder-target pair, with a residual connection and layer norm
        binder_wt_attn, _ = self.attention(binder_wt, binder_wt, binder_wt)
        binder_mut_attn, _ = self.attention(binder_mut, binder_mut, binder_mut)

        binder_wt_attn = binder_wt + binder_wt_attn
        binder_mut_attn = binder_mut + binder_mut_attn

        binder_wt_attn = self.layer_norm(binder_wt_attn)
        binder_mut_attn = self.layer_norm(binder_mut_attn)

        mapped_binder_wt = self.map(binder_wt_attn).squeeze(-1)    # B*(L1+L2)
        mapped_binder_mut = self.map(binder_mut_attn).squeeze(-1)  # B*(L1+L2)

        # Euclidean distance between the wild-type and mutant pair embeddings
        distance = torch.sqrt(torch.sum((mapped_binder_wt - mapped_binder_mut) ** 2, dim=-1))
        return distance

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        lr = opt.param_groups[0]['lr']
        self.log('learning_rate', lr, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)

        binder_tokens = batch['binder_input_ids'].to(self.device)
        mut_tokens = batch['mutant_input_ids'].to(self.device)
        wt_tokens = batch['wildtype_input_ids'].to(self.device)
        aff = batch['aff'].to(self.device)

        distance = self.forward(binder_tokens, wt_tokens, mut_tokens)

        # Two-sided hinge: keep margin * aff <= distance <= margin * (aff + 1)
        upper_loss = F.relu(distance - self.margin * (aff + 1))  # let distance < margin * (aff + 1)
        lower_loss = F.relu(self.margin * aff - distance)        # let distance > margin * aff

        loss = 5 * upper_loss + lower_loss

        self.log('train_loss', loss.mean().item(), on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        return loss.mean()

    def validation_step(self, batch, batch_idx):
        binder_tokens = batch['binder_input_ids'].to(self.device)
        mut_tokens = batch['mutant_input_ids'].to(self.device)
        wt_tokens = batch['wildtype_input_ids'].to(self.device)
        aff = batch['aff'].to(self.device)

        distance = self.forward(binder_tokens, wt_tokens, mut_tokens)

        upper_loss = F.relu(distance - self.margin * (aff + 1))
        lower_loss = F.relu(self.margin * aff - distance)

        loss = 5 * upper_loss + lower_loss

        # A prediction counts as correct when the distance falls inside the target band
        accuracy = torch.sum(torch.logical_and(torch.ge(distance, self.margin * aff), torch.le(distance, self.margin * (aff + 1)))) / aff.shape[0]

        self.log('val_loss', loss.mean().item(), on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        self.log('val_acc', accuracy.item(), on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.95))

        base_lr = 0.1 * self.learning_rate
        max_lr = self.learning_rate
        min_lr = 0.1 * self.learning_rate

        scheduler = CosineAnnealingWithWarmup(optimizer, warmup_steps=119, total_steps=1188,
                                              base_lr=base_lr, max_lr=max_lr, min_lr=min_lr)  # warmup_steps=3193, total_steps=31926

        lr_schedulers = {
            "scheduler": scheduler,
            "name": 'learning_rate_logs',
            "interval": 'step',  # The scheduler updates the learning rate at every step (not epoch)
            'frequency': 1  # The scheduler updates the learning rate after every batch
        }
        return [optimizer], [lr_schedulers]

    def on_train_epoch_end(self):
        gc.collect()
        torch.cuda.empty_cache()

    def load_weights(self, checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        state_dict = checkpoint['state_dict']
        self.load_state_dict(state_dict, strict=True)


def main(args):
    print(args)
    dist.init_process_group(backend='nccl')

    train_dataset = load_from_disk('/home/tc415/muPPIt_embedding/dataset/train/affinity_embedding_skempi')  # 408643
    val_dataset = load_from_disk('/home/tc415/muPPIt_embedding/dataset/val/affinity_embedding_skempi')
    # val_dataset = None
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

    data_module = CustomDataModule(train_dataset, val_dataset, tokenizer=tokenizer, batch_size=args.batch_size)

    print(f"Loading Pre-trained Model from {args.sm}")
    model = muPPIt.load_from_checkpoint(args.sm, d_node=args.d_node, num_heads=args.num_heads, dropout=args.dropout, margin=args.margin, lr=args.lr)

    run_id = str(uuid.uuid4())

    logger = WandbLogger(project="muppit_embedding",
                         # name="debug",
                         name=f"affinity_lr={args.lr}_gradclip={args.grad_clip}_margin={args.margin}",
                         job_type='model-training',
                         id=run_id)

    print(f"Saving to {args.output_file}")

    checkpoint_callback = ModelCheckpoint(
        monitor='val_acc',
        # monitor='val_loss',
        dirpath=args.output_file,
        # filename='model-{epoch:02d}-{val_loss:.2f}',
        filename='model-{epoch:02d}-{val_acc:.2f}',
        # filename='muppit',
        save_top_k=-1,
        mode='max',
        # mode='min',
        # every_n_train_steps=1000,
        # save_on_train_epoch_end=False
    )

    early_stopping_callback = EarlyStopping(
        # monitor='val_acc',
        monitor='val_loss',
        patience=10,
        verbose=True,
        # mode='max',
        mode='min',
    )

    accumulator = GradientAccumulationScheduler(scheduling={0: 4})

    trainer = pl.Trainer(
        max_epochs=args.max_epochs,
        accelerator='gpu',
        strategy='ddp_find_unused_parameters_true',
        precision='bf16',
        # logger=logger,
        devices=[0, 1],
        callbacks=[checkpoint_callback, accumulator],
        gradient_clip_val=args.grad_clip,
        # val_check_interval=100,
    )

    trainer.fit(model, datamodule=data_module)

    best_model_path = checkpoint_callback.best_model_path
    print(best_model_path)


if __name__ == "__main__":
    parser = ArgumentParser()

    parser.add_argument("-o", dest="output_file", help="File for output of model parameters", required=True, type=str)
    parser.add_argument("-lr", type=float, default=1e-3)
    parser.add_argument("-batch_size", type=int, default=2, help="Batch size")
    parser.add_argument("-grad_clip", type=float, default=0.5)
    parser.add_argument("-margin", type=float, default=0.5)
    parser.add_argument("-max_epochs", type=int, default=30)
    parser.add_argument("-d_node", type=int, default=1024, help="Node Representation Dimension")
    parser.add_argument("-num_heads", type=int, default=4)
    parser.add_argument("-dropout", type=float, default=0.1)
    parser.add_argument("-sm", type=str, default='/home/tc415/muPPIt_embedding/checkpoints/train_10/model-epoch=15-val_acc=0.62.ckpt')

    args = parser.parse_args()

    main(args)
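The objective in `training_step`/`validation_step` above is a two-sided hinge that pushes the wild-type/mutant embedding distance into the band `[margin * aff, margin * (aff + 1)]`, weighting overshoot five times more heavily than undershoot. A toy numerical check (standalone sketch with made-up values, not code from this commit) makes the band explicit:

```python
import torch
import torch.nn.functional as F

margin = 0.5
aff = torch.tensor([2.0, 2.0, 2.0])
distance = torch.tensor([0.8, 1.2, 1.9])  # below, inside, and above the band [1.0, 1.5]

upper_loss = F.relu(distance - margin * (aff + 1))  # nonzero only when distance > margin * (aff + 1)
lower_loss = F.relu(margin * aff - distance)        # nonzero only when distance < margin * aff
loss = 5 * upper_loss + lower_loss

print(loss)  # tensor([0.2000, 0.0000, 2.0000])
```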
muppit/.gitignore
ADDED
@@ -0,0 +1,7 @@
.idea/
.DS_Store
.ipynb_checkpoints/
__pycache__/
.hf_cache
outputs/
watch_folder/
muppit/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
muppit/README.md
ADDED
@@ -0,0 +1,250 @@
# Simple Guidance Mechanisms for Discrete Diffusion Models

[](https://arxiv.org/abs/2412.10193)
[](https://discrete-diffusion-guidance.github.io/)
[](https://huggingface.co/collections/kuleshov-group/udlm-675e63ab42bc757093099e1b)

<p align="center">
  <img src="https://discrete-diffusion-guidance.github.io/static/images/udlm.gif" alt="graphical abstract" width="450"/>
</p>

This repository contains code for reproducing experiments in the paper [Simple Guidance Mechanisms for Discrete Diffusion Models](https://arxiv.org/abs/2412.10193).

We also share [trained models](https://huggingface.co/collections/kuleshov-group/udlm-675e63ab42bc757093099e1b) on HuggingFace 🤗 and support integration with these models.
See the ["Using HuggingFace Models" section](#using-huggingface-models) below.

## Code Organization
<a name="code-organization"></a>
1. ```main.py```: Routines for training (language models and classifiers)
2. ```noise_schedule.py```: Noise schedules
3. ```diffusion.py```: Forward/reverse diffusion
   - Absorbing state / uniform noise diffusion
   - AR
4. ```dataloader.py```: Dataloaders
   - For the discretized CIFAR10 and Species10 datasets we use custom dataset classes defined in ```custom_datasets/```
5. ```utils.py```: LR scheduler, logging, `fsspec` handling
6. ```models/```: Denoising network architectures
7. ```configs/```: Config files for datasets/denoising networks/noise schedules/LR schedules
8. ```scripts/```: Shell scripts for training/evaluation
9. ```guidance_eval/```: Guidance evaluation scripts
### Implemented Decoding Mechanisms
<a name="implemented-decoding"></a>
In [`diffusion.py`](./diffusion.py), we define baseline and proposed decoding mechanisms for guidance.
These decoding schemes can be controlled via the hydra config with the `guidance` field.
For example, to use the proposed D-CFG guidance mechanism, set `guidance=cfg` in the config file and optionally set the `guidance.gamma` parameter to control the strength of the guidance signal (see the example command after the list below).

The implemented decoding methods are as follows:
- AR (Baseline):
  - Standard decoding (i.e., no guidance); set `guidance=null`
  - Classifier-free guidance (D-CFG); set `guidance=cfg`
  - Classifier-based guidance using [FUDGE](https://arxiv.org/abs/2104.05218) (set `guidance=fudge`) and using [PPLM](https://arxiv.org/abs/1912.02164) (set `guidance=pplm`)
- Diffusion:
  - Standard decoding (i.e., no guidance); set `guidance=null`
  - Classifier-free guidance (D-CFG); set `guidance=cfg`
  - Classifier-based guidance (D-CBG); set `guidance=cbg`
  - Classifier-based (baseline) method of [NOS](https://arxiv.org/abs/2305.20009); set `guidance=nos`
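Since `guidance` is a hydra config group (see `configs/guidance/`), a mechanism can also be selected from the command line. The overrides below are a sketch: the exact flag names should be verified against `configs/config.yaml`.

```bash
# Illustrative hydra overrides; verify names against configs/config.yaml
python main.py \
  data=qm9 \
  guidance=cfg \
  guidance.gamma=3.0
```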
### Implemented Generative Models
<a name="implemented-models"></a>
The three modeling parameterizations we explore in this work are:
1. Autoregressive (AR) Models
2. Masked Diffusion Language Models (MDLM)
3. Uniform Diffusion Language Models (UDLM)

The `config` files can be used to specify which of these parameterizations to use.
Below we detail which config parameters correspond to which model.

**AR**
```bash
diffusion="absorbing_state"  # AR models can be thought of as a special case of absorbing state diffusion models
parameterization="ar"
T=0  # N/A for AR models; this is a placeholder
time_conditioning=False  # AR models are not conditioned on time
zero_recon_loss=False  # N/A for this model
```

**MDLM**
```bash
diffusion="absorbing_state"
parameterization="subs"  # See the MDLM paper for details: https://arxiv.org/abs/2406.07524
T=0  # Indicates continuous time, i.e., T --> infinity
time_conditioning=False  # MDLM is not conditioned on time
zero_recon_loss=False  # N/A for this model
```

**UDLM**
```bash
diffusion="uniform"
parameterization="d3pm"  # Indicates that we explicitly compute KL on posteriors
T=0  # Indicates continuous time, i.e., T --> infinity
time_conditioning=True  # UDLM is conditioned on time
zero_recon_loss=True  # In continuous time, the reconstruction loss evaluates to zero
```

## Getting started in this repository
<a name="getting-started"></a>

To get started, create a conda environment containing the required dependencies.

```bash
conda env create -f requirements.yaml
conda activate discdiff
```

Create the following directories to store saved models and slurm logs:
```bash
mkdir outputs
mkdir watch_folder
```

We rely on `wandb` integration to log experiments and eval curves.
+
## Reproducing Experiments
|
110 |
+
<a name="reproducing-experiments"></a>
|
111 |
+
|
112 |
+
Below, we describe the steps required for reproducing the experiments in the paper.
|
113 |
+
Throughout, the main entry point for running experiments is the [`main.py`](./main.py) script.
|
114 |
+
We also provide sample `slurm` scripts for launching pre-training and evaluation experiments in the [`scrips/`](./scripts) directory.
|
115 |
+
|
116 |
+
|
117 |
+
### Language Modeling Experiments
|
118 |
+
<a name="lm_training"></a>
|
119 |
+
To reproduce the language modeling results, please refer to the following shell scripts in the [`scripts/`](./scripts) directory:
|
120 |
+
- Species10: [`train_ten_species_guidance.sh`](./scripts/train_ten_species_guidance.sh)
|
121 |
+
- QM9: [`train_qm9_no-guidance.sh`](./scripts/train_qm9_no-guidance.sh)
|
122 |
+
- CIFAR10: [`train_cifar10_unet_guidance.sh`](./scripts/train_cifar10_unet_guidance.sh)
|
123 |
+
- text8: [`train_text8.sh`](./scripts/train_text8.sh)
|
124 |
+
- Amazon Polarity: [`train_amazon_polarity.sh`](./scripts/train_amazon_polarity.sh)
|
125 |
+
- LM1B: [`train_lm1b.sh`](./scripts/train_lm1b.sh)
|
126 |
+
|
127 |
+
Each script contains a comment detailing the usage.
|
128 |
+
For example, to train either an AR,
|
129 |
+
MDLM, or UDLM model on the `text8` dataset, use the following command:
|
130 |
+
```bash
|
131 |
+
cd scripts/
|
132 |
+
MODEL=<ar|mdlm|udlm>
|
133 |
+
sbatch \
|
134 |
+
--export=ALL,MODEL=${MODEL} \
|
135 |
+
--job-name=train_text8_${MODEL} \
|
136 |
+
train_text8.sh
|
137 |
+
```
|
138 |
+
### Guidance Training
|
139 |
+
<a name="guidance-training"></a>
|
140 |
+
#### Classifier-Free
|
141 |
+
<a name="guidance-training-cfg"></a>
|
142 |
+
For classifier-free guidance we require training models
|
143 |
+
that can condition on the class label
|
144 |
+
to model conditional distributions,
|
145 |
+
and we randomly mask out the signal,
|
146 |
+
replacing it with a dummy value of `num_claseses + 1`, to simulate an unconditional model.
|
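As a concrete picture of this conditioning-signal dropout, here is a minimal standalone sketch (not code from this repo; the dropout probability is an assumption for illustration, and the dummy value follows the `num_classes + 1` convention described above):

```python
import torch

def mask_labels_for_cfg(labels: torch.Tensor, num_classes: int, p_uncond: float = 0.1) -> torch.Tensor:
    """Randomly replace labels with the dummy 'unconditional' class num_classes + 1."""
    drop = torch.rand(labels.shape, device=labels.device) < p_uncond
    return torch.where(drop, torch.full_like(labels, num_classes + 1), labels)

# Example: a batch of 8 labels over 10 classes; roughly 10% become the dummy class 11
labels = torch.randint(0, 10, (8,))
print(mask_labels_for_cfg(labels, num_classes=10))
```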
147 |
+
Refer to the shell scripts with the `_guidance` suffix
|
148 |
+
to train these models for CIFAR10,
|
149 |
+
QM9, and Species10 datasets.
|
150 |
+
For QM9, we have two experiments,
|
151 |
+
one where we condition on the drug-likeness
|
152 |
+
(`qed`)
|
153 |
+
of the molecules and another
|
154 |
+
where we condition on the ring counts (`ring_count`).
|
155 |
+
|
156 |
+
#### Classifier-Based
|
157 |
+
<a name="guidance-training-cbg"></a>
|
158 |
+
For classifier-based guidance,
|
159 |
+
we need to train a classifier on the noisy latent samples.
|
160 |
+
Refer to the following shell scripts
|
161 |
+
to train these classifiers:
|
162 |
+
- [FUDGE](https://arxiv.org/abs/2104.05218) (AR guidance): [`train_qm9_fudge_classifier.sh`](./scripts/train_qm9_fudge_classifier.sh)
|
163 |
+
- D-CBG (diffusion guidance): [`train_qm9_classifier.sh`](./scripts/train_qm9_classifier.sh)
|
164 |
+
|
165 |
+
##### PPLM / NOS baselines
|
166 |
+
An alternative classifier-based guidance mechanism to D-CBG is that of [PPLM](https://arxiv.org/abs/1912.02164)
|
167 |
+
(which was adapted for diffusion models in [NOS](https://arxiv.org/abs/2305.20009)).
|
168 |
+
To train these classifiers,
|
169 |
+
refer to the following shell script:
|
170 |
+
[`train_qm9_pplm_classifier.sh`](./scripts/train_qm9_pplm_classifier.sh)
|
171 |
+
(for both PPLM and NOS classifiers).

### Guidance Evaluation
<a name="guidance-eval"></a>
To evaluate guidance mechanisms, we load trained models (and classifiers, if applicable) and generate some number of samples for which we compute "quality" metrics (e.g., validity/novelty in the QM9 experiments) and control label satisfaction (e.g., mean value of novel generated molecules for the property of interest in the QM9 experiments).

The scripts for these evaluations can be found in the [`guidance_eval/`](./guidance_eval) directory.
To run these evaluations, please refer to the following shell scripts:
- QM9: [`eval_qm9_guidance.sh`](./guidance_eval/eval_qm9_guidance.sh)
- Species10: [`eval_ten_species_guidance.sh`](./guidance_eval/eval_ten_species_guidance.sh)
  - For this dataset, we also evaluate the accuracy of a HyenaDNA classifier at correctly classifying generated sequences. This model can be trained using [`train_ten_species_eval_classifier.sh`](./scripts/train_ten_species_eval_classifier.sh).
    - To see how this trained evaluation classifier performs on the validation set of the original data, use the notebook [`eval_hyenadna_classifier.ipynb`](./notebooks/eval_hyenadna_classifier.ipynb).

In the paper, we performed an extensive hyperparameter sweep for our proposed guidance mechanisms and for baselines.
The shell scripts can be used to reproduce these experiments, e.g., for the D-CFG experiments on QM9:
```bash
export MODEL=<ar|mdlm|udlm>
export PROP=<qed|ring_count>
export GUIDANCE=cfg
for GAMMA in $(seq 1 5); do
  sbatch \
    --export=ALL,MODEL=${MODEL},PROP=${PROP},GUIDANCE=${GUIDANCE},GAMMA=${GAMMA} \
    --job-name=eval_qm9_${GUIDANCE}_${PROP}_${MODEL}_GAMMA-${GAMMA} \
    eval_qm9_guidance.sh
done
```

Once each evaluation run is complete, a `.csv` file containing the results is saved in the run directory of the trained generative model.
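
To compare runs from a sweep like the one above, the per-run `.csv` files can be collected with a few lines of pandas. A sketch under assumed paths (the actual file names and columns depend on the evaluation script):
```python
from pathlib import Path

import pandas as pd

# Assumption: each run directory under outputs/ holds one or more result
# .csv files; column names depend on the evaluation script.
frames = []
for path in Path("outputs").rglob("*.csv"):
    df = pd.read_csv(path)
    df["run"] = str(path.parent)  # remember which run each row came from
    frames.append(df)
results = pd.concat(frames, ignore_index=True)
print(results.head())
```
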

## Using HuggingFace Models
<a name="hf_models"></a>
We provide pre-trained models on HuggingFace 🤗:
- UDLM trained on LM1B: [kuleshov-group/udlm-lm1b](https://huggingface.co/kuleshov-group/udlm-lm1b)
- UDLM trained on QM9: [kuleshov-group/udlm-qm9](https://huggingface.co/kuleshov-group/udlm-qm9)
  - Note: this model was trained without guidance and can be used with classifier-free guidance.

Please see the README pages for these models on HuggingFace or our paper for more details about the training of these models.

To use these models, you can load them using the HuggingFace API, e.g.,
```python
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("kuleshov-group/udlm-lm1b")
```
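
A matching tokenizer should be loaded before encoding any text. A minimal sketch, assuming the tokenizer is hosted under the same repo id (custom architectures on HuggingFace may additionally require `trust_remote_code=True`):
```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_id = "kuleshov-group/udlm-lm1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
print(model.config)  # inspect vocab size, hidden size, etc.
```
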

To use these models in our repository, set the following `config` parameters:
```bash
backbone="hf_dit"
model="hf"
model.pretrained_model_name_or_path="kuleshov-group/udlm-lm1b"  # or "kuleshov-group/udlm-qm9"
```

## Acknowledgements
<a name="acknowledgements"></a>
This repository was built off of [MDLM](https://github.com/kuleshov-group/mdlm), which in turn built on [SEDD](https://github.com/louaaron/Score-Entropy-Discrete-Diffusion).
Our code implementation of D-CBG is adapted from Nisonoff et al.'s [repo](https://github.com/hnisonoff/discrete_guidance).

## Citation
<a name="citation"></a>
```
@article{schiff2024discreteguidance,
  title={Simple Guidance Mechanisms for Discrete Diffusion Models},
  author={Schiff, Yair and Sahoo, Subham Sekhar and Phung, Hao and Wang, Guanghan and Boshar, Sam and Dalla-torre, Hugo and de Almeida, Bernardo P and Rush, Alexander and Pierrot, Thomas and Kuleshov, Volodymyr},
  journal={arXiv preprint arXiv:2412.10193},
  year={2024}
}
```
muppit/__pycache__/classifier.cpython-310.pyc
ADDED
Binary file (13.8 kB)
muppit/__pycache__/dataloader.cpython-310.pyc
ADDED
Binary file (18 kB)
muppit/__pycache__/diffusion.cpython-310.pyc
ADDED
Binary file (33.4 kB)
muppit/__pycache__/noise_schedule.cpython-310.pyc
ADDED
Binary file (6.19 kB)
muppit/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.77 kB)
muppit/classifier.py
ADDED
@@ -0,0 +1,490 @@
import itertools
import typing

import hydra.utils
import lightning as L
import torch
import torch.nn.functional as F
import torchmetrics
import transformers

import dataloader
import models.dimamba  # needed for the DiMamba classifier backbone below
import models.dit
import noise_schedule


class MicroAveragingMetric(torchmetrics.Metric):
  """Micro-averaging metric.

  Adapted from https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py#L12
  """

  def __init__(self, class_idx: typing.Optional[int] = 1,
               dist_sync_on_step=False):
    super().__init__(dist_sync_on_step=dist_sync_on_step)
    self.class_idx = torch.tensor(class_idx) \
      if class_idx is not None else None
    self.add_state("numerator", default=torch.tensor(0.0),
                   dist_reduce_fx="sum")
    self.add_state("denominator", default=torch.tensor(0.0),
                   dist_reduce_fx="sum")

  def _update(
      self, numerator, denominator, preds, y) -> tuple:
    raise NotImplementedError

  def update(self, logits: torch.Tensor, y: torch.Tensor):
    # update metric states
    preds = torch.argmax(logits, dim=-1)
    y = y.view(-1)
    assert preds.shape == y.shape, \
      f"preds shape {preds.shape} != y shape {y.shape}"
    self.numerator, self.denominator = self._update(
      self.numerator, self.denominator, preds, y)

  def compute(self):
    # compute final result
    value = self.numerator.float() / self.denominator \
      if self.denominator.item() > 0. else torch.tensor(0.0)
    return value

  def reset(self):
    self.numerator = torch.tensor(0.0).to(self.device)
    self.denominator = torch.tensor(0.0).to(self.device)


class CrossEntropy(MicroAveragingMetric):
  """Calculates cross-entropy loss."""

  def _update(
      self, numerator, denominator, logits, y) -> tuple:
    with torch.no_grad():
      numerator += F.cross_entropy(
        logits.view(-1, logits.size(-1)),
        y.view(-1),
        ignore_index=-100,
        reduction='sum')
      denominator += y.numel()
    return numerator, denominator

  # Overrides parent class to use logits and not (argmax) preds
  def update(self, logits: torch.Tensor, y: torch.Tensor):
    y = y.view(-1)
    self.numerator, self.denominator = self._update(
      self.numerator, self.denominator, logits, y)


class Accuracy(MicroAveragingMetric):
  """Calculates accuracy.

  Can be used to calculate accuracy per class.
  Copied from:
    https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py
  """

  def _update(
      self, numerator, denominator, preds, y) -> tuple:
    if self.class_idx is None:
      numerator += (preds == y).sum()
      denominator += y.numel()
    else:
      class_idx = self.class_idx
      relevant_idxs = (y == class_idx)
      numerator += (preds[relevant_idxs] == class_idx).sum()
      denominator += relevant_idxs.sum()
      relevant_idxs = (y != class_idx)
      numerator += (preds[relevant_idxs] != class_idx).sum()
      denominator += relevant_idxs.sum()
    return numerator, denominator


class Precision(MicroAveragingMetric):
  """Calculates precision.

  Can be used to calculate precision per class.
  Adapted from:
    https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py
  """

  def _update(self, numerator, denominator, preds, y) -> tuple:
    class_idx = self.class_idx
    relevant_idxs = (preds == class_idx)
    numerator += (y[relevant_idxs] == class_idx).sum()
    denominator += relevant_idxs.sum()
    return numerator, denominator


class Recall(MicroAveragingMetric):
  """Calculates recall.

  Can be used to calculate recall per class.
  Adapted from:
    https://github.com/HazyResearch/hyena-dna/blob/main/src/tasks/metrics.py
  """

  def _update(self, numerator, denominator, preds, y) -> tuple:
    class_idx = self.class_idx
    relevant_idxs = (y == class_idx)
    numerator += (preds[relevant_idxs] == class_idx).sum()
    denominator += relevant_idxs.sum()
    return numerator, denominator


class Classifier(L.LightningModule):
  def __init__(
      self,
      config,
      tokenizer: transformers.PreTrainedTokenizer,
      pretrained_backbone: typing.Optional[torch.nn.Module] = None):
    super().__init__()
    self.save_hyperparameters(ignore=['pretrained_backbone'])
    self.config = config

    # This param indicates whether this model will be used
    # for guidance (False) or only evaluation (True).
    self.is_eval_classifier = getattr(
      config, 'is_eval_classifier', False)

    self.tokenizer = tokenizer
    self.vocab_size = tokenizer.vocab_size
    self.antithetic_sampling = config.training.antithetic_sampling
    self.importance_sampling = config.training.importance_sampling
    self.change_of_variables = config.training.change_of_variables
    if (not hasattr(self.tokenizer, 'mask_token')
        or self.tokenizer.mask_token is None):
      self.mask_index = self.vocab_size
      self.vocab_size += 1
    else:
      self.mask_index = self.tokenizer.mask_token_id

    if config.classifier_backbone == 'dit':
      self.classifier_model = models.dit.DITClassifier(
        self.config, vocab_size=self.vocab_size)
    elif self.config.classifier_backbone == 'dimamba':
      self.classifier_model = models.dimamba.DiMambaClassifier(
        self.config, vocab_size=self.vocab_size,
        pad_token_id=self.tokenizer.pad_token_id)
    elif config.classifier_backbone == 'hyenadna':
      hyena_config = transformers.AutoConfig.from_pretrained(
        config.classifier_model.hyena_model_name_or_path,
        n_layer=config.classifier_model.n_layer,
        trust_remote_code=True
      )
      self.classifier_model = transformers.AutoModelForSequenceClassification.from_config(
        hyena_config,
        pretrained=False,
        num_labels=config.data.num_classes,
        problem_type='single_label_classification',
        trust_remote_code=True
      )
    else:
      raise NotImplementedError(
        f"Classifier backbone "
        f"{self.config.classifier_backbone} not "
        f"implemented.")
    if pretrained_backbone is not None:  # For PPLM / NOS
      self.classifier_model.load_pretrained_encoder(
        pretrained_backbone)
    # Metrics are automatically reset at end of epoch
    metrics = torchmetrics.MetricCollection({
      'cross_entropy': CrossEntropy(),
      'accuracy': Accuracy(class_idx=None),
    })
    if config.data.num_classes > 2:
      for c in range(config.data.num_classes):
        metrics.add_metrics(
          {f"accuracy_class{c}": Accuracy(class_idx=c),
           f"precision_class{c}": Precision(class_idx=c),
           f"recall_class{c}": Recall(class_idx=c)})
    else:
      metrics.add_metrics(
        {'precision': Precision(class_idx=1),
         'recall': Recall(class_idx=1)})
    metrics.set_dtype(torch.float64)
    self.train_metrics = metrics.clone(prefix='train/')
    self.valid_metrics = metrics.clone(prefix='val/')

    self.T = config.T
    self.noise = noise_schedule.get_noise(config,
                                          dtype=self.dtype)
    self.sampling_eps = config.training.sampling_eps
    self.lr = config.optim.lr
    self.time_conditioning = config.time_conditioning
    self.fast_forward_epochs = None
    self.fast_forward_batches = None

  def on_load_checkpoint(self, checkpoint):
    # Copied from:
    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py#L41
    self.fast_forward_epochs = checkpoint['loops'][
      'fit_loop']['epoch_progress']['current']['completed']
    self.fast_forward_batches = checkpoint['loops'][
      'fit_loop']['epoch_loop.batch_progress'][
      'current']['completed']

  def on_save_checkpoint(self, checkpoint):
    # Copied from:
    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py
    # ['epoch_loop.batch_progress']['total']['completed'] is
    # 1 iteration behind, so we're using the optimizer's
    # progress.
    checkpoint['loops']['fit_loop'][
      'epoch_loop.batch_progress']['total'][
      'completed'] = checkpoint['loops']['fit_loop'][
      'epoch_loop.automatic_optimization.optim_progress'][
      'optimizer']['step']['total'][
      'completed'] * self.trainer.accumulate_grad_batches
    checkpoint['loops']['fit_loop'][
      'epoch_loop.batch_progress']['current'][
      'completed'] = checkpoint['loops']['fit_loop'][
      'epoch_loop.automatic_optimization.optim_progress'][
      'optimizer']['step']['current'][
      'completed'] * self.trainer.accumulate_grad_batches
    # _batches_that_stepped tracks the number of global
    # steps, not the number of local steps, so we don't
    # multiply with self.trainer.accumulate_grad_batches
    # here.
    checkpoint['loops']['fit_loop'][
      'epoch_loop.state_dict'][
      '_batches_that_stepped'] = \
      checkpoint['loops']['fit_loop'][
        'epoch_loop.automatic_optimization.optim_progress'][
        'optimizer']['step']['total']['completed']
    if 'sampler' not in checkpoint.keys():
      checkpoint['sampler'] = {}
    if hasattr(self.trainer.train_dataloader.sampler,
               'state_dict'):
      sampler_state_dict = self.trainer. \
        train_dataloader.sampler.state_dict()
      checkpoint['sampler'][
        'random_state'] = sampler_state_dict.get(
        'random_state', None)
    else:
      checkpoint['sampler']['random_state'] = None

  def on_train_start(self):
    # Adapted from:
    # https://github.com/Dao-AILab/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
    distributed = (
      self.trainer._accelerator_connector.use_distributed_sampler
      and self.trainer._accelerator_connector.is_distributed)
    if distributed:
      sampler_cls = dataloader.FaultTolerantDistributedSampler
    else:
      sampler_cls = dataloader.RandomFaultTolerantSampler
    updated_dls = []
    for dl in self.trainer.fit_loop._combined_loader.flattened:
      if hasattr(dl.sampler, 'shuffle'):
        dl_sampler = sampler_cls(
          dl.dataset, shuffle=dl.sampler.shuffle)
      else:
        dl_sampler = sampler_cls(dl.dataset)
      if (distributed
          and self.fast_forward_epochs is not None
          and self.fast_forward_batches is not None):
        dl_sampler.load_state_dict({
          'epoch': self.fast_forward_epochs,
          'counter': (self.fast_forward_batches
                      * self.config.loader.batch_size)})
      updated_dls.append(
        torch.utils.data.DataLoader(
          dl.dataset,
          batch_size=self.config.loader.batch_size,
          num_workers=self.config.loader.num_workers,
          pin_memory=self.config.loader.pin_memory,
          sampler=dl_sampler,
          shuffle=False,
          persistent_workers=self.config.loader.persistent_workers
        ))
    self.trainer.fit_loop._combined_loader.flattened = updated_dls

  def forward(self, x, sigma=None, x_emb=None, attention_mask=None):
    """Returns logits.

    x_emb can be provided during PPLM / NoS-style guidance
    (see: https://arxiv.org/abs/2305.20009).
    """
    if self.is_eval_classifier:
      logits = self.classifier_model(x)
      if hasattr(logits, 'logits'):
        logits = logits.logits
    else:
      sigma = self._process_sigma(sigma) if sigma is not None else sigma
      with torch.cuda.amp.autocast(dtype=torch.float32):
        logits = self.classifier_model(
          x, sigma, x_emb=x_emb, attention_mask=attention_mask)
    return logits

  def get_log_probs(self, x, sigma, x_emb=None):
    """Returns log probabilities.

    Use for CBG-style guidance.
    """
    if self.is_eval_classifier:
      raise NotImplementedError(
        '`get_log_probs` not implemented for classifiers '
        'that are meant to be used for evaluation purposes '
        'only.')
    with torch.cuda.amp.autocast(dtype=torch.float32):
      return torch.nn.functional.log_softmax(
        self.forward(x, sigma, x_emb=x_emb), dim=-1)

  def training_step(self, batch, batch_idx):
    loss = self._compute_loss(batch, prefix='train')
    self.log(name='trainer/loss',
             value=loss.item(),
             on_step=True,
             on_epoch=False,
             sync_dist=True,
             prog_bar=True)
    self.log(name='lr',
             value=self.trainer.optimizers[0].param_groups[0]['lr'],
             on_step=True,
             on_epoch=False,
             sync_dist=True,
             prog_bar=True, logger=False)
    return loss

  def validation_step(self, batch, batch_idx):
    return self._compute_loss(batch, prefix='val')

  def configure_optimizers(self):
    # TODO(yair): Lightning currently giving this warning when using `fp16`:
    #  "Detected call of `lr_scheduler.step()` before `optimizer.step()`."
    #  Not clear if this is a problem or not.
    #  See: https://github.com/Lightning-AI/pytorch-lightning/issues/5558
    optimizer = torch.optim.AdamW(
      itertools.chain(self.classifier_model.parameters(),
                      self.noise.parameters()),
      lr=self.config.optim.lr,
      betas=(self.config.optim.beta1,
             self.config.optim.beta2),
      eps=self.config.optim.eps,
      weight_decay=self.config.optim.weight_decay)

    scheduler = hydra.utils.instantiate(
      self.config.lr_scheduler, optimizer=optimizer)
    scheduler_dict = {
      'scheduler': scheduler,
      'interval': 'step',
      'monitor': 'val/loss',
      'name': 'trainer/lr',
    }
    return [optimizer], [scheduler_dict]

  def _q_xt(self, x, move_chance):
    """Computes the noisy sample xt.

    Args:
      x: int torch.Tensor with shape (batch_size,
        diffusion_model_input_length), input.
      move_chance: float torch.Tensor with shape
        (batch_size, 1).
    """
    move_indices = torch.rand(
      *x.shape, device=x.device) < move_chance
    if self.config.diffusion == 'absorbing_state':
      return torch.where(move_indices, self.mask_index, x)
    if self.config.diffusion == 'uniform':
      uniform_tensor = torch.randint(
        0, self.vocab_size, x.shape, device=x.device)
      return torch.where(move_indices, uniform_tensor, x)
    raise NotImplementedError(
      f'Diffusion type {self.config.diffusion} not '
      'implemented.')

  def _compute_loss(self, batch, prefix):
    x0 = batch['input_ids']
    attention_mask = batch['attention_mask']
    t = None
    if self.is_eval_classifier:
      logits = self.forward(x0)
    elif self.config.parameterization == 'ar':
      # do not add noise for AR FUDGE and AR PPLM
      logits = self.forward(
        x0, attention_mask=attention_mask)
    else:
      t = self._sample_t(x0.shape[0])
      if self.T > 0:
        t = (t * self.T).to(torch.int)
        t = t / self.T
        # t \in {1/T, 2/T, ..., 1}
        t += (1 / self.T)
      if self.change_of_variables:
        time_conditioning = t[:, None]
        f_T = torch.log1p(- torch.exp(- self.noise.sigma_max))
        f_0 = torch.log1p(- torch.exp(- self.noise.sigma_min))
        move_chance = torch.exp(f_0 + t * (f_T - f_0))
        move_chance = move_chance[:, None]
      else:
        sigma, _ = self.noise(t)
        time_conditioning = sigma[:, None]
        move_chance = 1 - torch.exp(-sigma[:, None])

      xt = self._q_xt(x0, move_chance)
      logits = self.forward(
        xt, time_conditioning, attention_mask=attention_mask)
    if hasattr(self.config.data, 'label_col'):
      if f"{self.config.data.label_col}_threshold" in batch:
        y = batch[f"{self.config.data.label_col}_threshold"]
      else:
        y = batch[self.config.data.label_col]
    else:
      y = batch['label']
    if (not self.is_eval_classifier
        and getattr(self.config.training, 'use_label_smoothing', False)):
      # Interpolate between one-hot and uniform distribution
      labels = (
        torch.nn.functional.one_hot(
          y, self.config.data.num_classes) * (1 - t)[..., None]
        + (1 / self.config.data.num_classes) * t[..., None])
    else:
      labels = y.view(-1)
    if getattr(self.config, 'is_fudge_classifier', False):
      # FUDGE trains a per-prefix predictor: expand the sequence-level
      # label to every non-padding position.
      expanded_y = y.unsqueeze(1).expand(-1, logits.shape[1])  # batch x seq
      logits = logits.view(
        -1, self.config.data.num_classes)[attention_mask.flatten() == 1, ...]
      y = expanded_y.flatten().long()[attention_mask.flatten() == 1]
      loss = torch.nn.functional.cross_entropy(
        logits,
        y,
        ignore_index=-100,
        reduction='mean')
    else:
      loss = torch.nn.functional.cross_entropy(
        logits.view(-1, logits.size(-1)),
        labels,
        ignore_index=-100,
        reduction='mean')

    if prefix == 'train':
      self.train_metrics.update(logits, y)
      metrics = self.train_metrics
    elif prefix == 'val':
      self.valid_metrics.update(logits, y)
      metrics = self.valid_metrics
    elif prefix == 'test':
      self.test_metrics.update(logits, y)
      metrics = self.test_metrics
    else:
      raise ValueError(f'Invalid prefix: {prefix}')

    self.log_dict(metrics,
                  on_step=False,
                  on_epoch=True,
                  sync_dist=True)
    return loss

  def _sample_t(self, n):
    _eps_t = torch.rand(n, device=self.device)
    if self.antithetic_sampling:
      # Stratify the time samples across the batch to reduce variance.
      offset = torch.arange(n, device=self.device) / n
      _eps_t = (_eps_t / n + offset) % 1
    t = (1 - self.sampling_eps) * _eps_t + self.sampling_eps
    if self.importance_sampling:
      return self.noise.importance_sampling_transformation(t)
    return t

  def _process_sigma(self, sigma):
    if sigma.ndim > 1:
      sigma = sigma.squeeze(-1)
    if not self.time_conditioning:
      sigma = torch.zeros_like(sigma)
    assert sigma.ndim == 1, sigma.shape
    return sigma
muppit/configs/callbacks/checkpoint_every_n_steps.yaml
ADDED
@@ -0,0 +1,8 @@
checkpoint_every_n_steps:
  _target_: lightning.pytorch.callbacks.ModelCheckpoint
  save_top_k: -1  # Do not save any "best" models; this callback is being used to save every n train steps
  save_last: True  # save model as ${save_dir}/checkpoints/last.ckpt
  dirpath: ${checkpointing.save_dir}/checkpoints
  verbose: True
  auto_insert_metric_name: False
  # every_n_train_steps: 500
muppit/configs/callbacks/checkpoint_monitor.yaml
ADDED
@@ -0,0 +1,10 @@
checkpoint_monitor:
  _target_: lightning.pytorch.callbacks.ModelCheckpoint
  monitor: val/nll  # name of the logged metric which determines when model is improving
  mode: min  # can be "max" or "min"
  save_top_k: 1  # save k best models (determined by above metric)
  save_last: False  # True = additionally always save model from last epoch
  dirpath: ${checkpointing.save_dir}/checkpoints
  filename: best
  auto_insert_metric_name: False
  verbose: True
muppit/configs/callbacks/learning_rate_monitor.yaml
ADDED
@@ -0,0 +1,3 @@
learning_rate_monitor:
  _target_: lightning.pytorch.callbacks.LearningRateMonitor
  logging_interval: step
muppit/configs/classifier_model/dimamba-classifier.yaml
ADDED
@@ -0,0 +1,14 @@
name: dimamba
type: dimamba
hidden_size: 256
cond_dim: 128
length: ${model.length}  # Same length as diffusion model
n_blocks: 8
scale_by_sigma: True
dropout: 0.1
tie_word_embeddings: False
bidirectional: True
bidirectional_strategy: add
bidirectional_weight_tie: True
num_classes: ${data.num_classes}
pooling: mean
muppit/configs/classifier_model/hyenadna-classifier.yaml
ADDED
@@ -0,0 +1,4 @@
name: hyena-32k
type: hyenadna
hyena_model_name_or_path: ???
n_layer: 4
muppit/configs/classifier_model/small-classifier.yaml
ADDED
@@ -0,0 +1,11 @@
name: small
type: ddit
hidden_size: 768
cond_dim: 128
length: ${model.length}  # Same length as diffusion model
n_blocks: 12
n_heads: 12
scale_by_sigma: True
dropout: 0.1
num_classes: ${data.num_classes}
pooling: mean
muppit/configs/classifier_model/tiny-classifier.yaml
ADDED
@@ -0,0 +1,11 @@
name: tiny
type: ddit
hidden_size: 512
cond_dim: 128
length: ${model.length}  # Same length as diffusion model
n_blocks: 8
n_heads: 8
scale_by_sigma: True
dropout: 0.1
num_classes: ${data.num_classes}
pooling: mean
muppit/configs/classifier_model/tiny-dimamba-classifier.yaml
ADDED
@@ -0,0 +1,14 @@
name: tiny
type: dimamba
hidden_size: 128
cond_dim: 128
length: ${model.length}  # Same length as diffusion model
n_blocks: 4
scale_by_sigma: True
dropout: 0.1
tie_word_embeddings: False
bidirectional: True
bidirectional_strategy: add
bidirectional_weight_tie: True
num_classes: ${data.num_classes}
pooling: mean
muppit/configs/config.yaml
ADDED
@@ -0,0 +1,104 @@
defaults:
  - _self_
  - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
  - /data: peptide
  - /model: small
  - /strategy: ddp
  - /noise: loglinear
  - /lr_scheduler: cosine_decay_warmup  # constant_warmup
  - /classifier_model: null
  - /guidance: cbg

mode: ppl_eval  # train / train_classifier / ppl_eval
diffusion: uniform  # absorbing_state / uniform
backbone: dit  # dit / dimamba / ar
classifier_backbone: null
parameterization: d3pm  # subs / d3pm / ar
time_conditioning: True  # UDLM is conditioned on time
subs_masking: False
zero_recon_loss: True  # Use for UDLM
T: 0  # 0 (continuous time) / 1000

is_vision: False
seed: 13

loader:
  global_batch_size: 512
  eval_global_batch_size: ${.global_batch_size}
  # Note: batch_size and eval_batch_size are **per machine**
  batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  num_workers: 0  # ${eval:"len(__import__('os').sched_getaffinity(0))"}
  pin_memory: True
  persistent_workers: False  # True

sampling:
  use_cache: True
  steps: 128
  # Note: batch_size is **per machine**
  batch_size: 1  # ${loader.eval_batch_size}
  num_sample_batches: 50  # Total samples: `num_gpus` * `batch_size` * `num_sample_batches`
  use_float64: False

eval:
  checkpoint_path: '/home/tc415/muPPIt_embedding/muppit/model_path/PeptideUDLM.ckpt'  # Used to evaluate a checkpoint after training.
  wildtype: 'MAEYLASIFGTEKDKVNCSFYFKIGACRHGDRCSRLHNKPTFSQTIALLNIYRNPQNSSQSADGLRCAVSDVEMQEHYDEFFEEVFTEMEEKYGEVEEMNVCDNLGDHLVGNVYVKFRREEDAEKAVIDLNNRWFNGQPIHAELSPVTDFREACCRQYEMGECTRGGFCNFMHLKPISRELRRELYGRRRKKHRSRSRSRERRSRSRDRGRGGGGGGGGGGGGRERDRRRSRDRERSGRF'
  mutant: 'MAEYLASIFGTEKDKVNCSFYFKIGACRHGDRCFRLHNKPTFSQTIALLNIYRNPQNSSQSADGLRCAVSDVEMQEHYDEFFEEVFTEMEEKYGEVEEMNVCDNLGDHLVGNVYVKFRREEDAEKAVIDLNNRWFNGQPIHAELSPVTDFREACCRQYEMGECTRGGFCNFMHLKPISRELRRELYGRRRKKHRSRSRSRERRSRSRDRGRGGGGGGGGGGGGRERDRRRSRDRERSGRF'

  disable_ema: False
  generate_samples: True
  generated_samples_path: ''
  max_samples: 50_000

training:
  ema: 0.9999
  antithetic_sampling: True
  importance_sampling: False
  sampling_eps: 1e-3
  change_of_variables: False
  compute_loss_on_pad_tokens: True
  use_simple_ce_loss: False  # Ignore ELBO; just use CE
  guidance: null  # Can turn off with `training.guidance: null`
  # cond_dropout: 0.0

optim:
  weight_decay: 1e-4
  lr: 1e-5
  beta1: 0.9
  beta2: 0.999
  eps: 1e-8

trainer:
  _target_: lightning.Trainer
  accelerator: cuda
  num_nodes: 1
  devices: 2  # ${device_count:}
  accumulate_grad_batches: 1  # ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
  gradient_clip_val: 1.0
  precision: 'bf16-mixed'
  num_sanity_val_steps: 2
  # max_epochs: 10
  max_steps: 1652000
  log_every_n_steps: 100
  limit_train_batches: 1.0  # train on full dataset, can be used to toggle quick run
  limit_val_batches: 1.0  # validate on full dataset, can be used to toggle quick run
  val_check_interval: 16520  # 2545

wandb:
  project: moPPIt-v2
  job_type: model-training
  name: protein_medium_100epochs_lr1e-5_gradclip1_wd1e-4_dropout0.1  # epochs10_lr3e-4_bsz8_64-true_all-params_gradclip1_beta-one0.9_beta-two0.999
  id: ${.name}

hydra:
  run:
    dir: ./outputs/${wandb.name}  # ./outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
  job:
    chdir: true

checkpointing:
  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
  save_dir: ${cwd:}
  # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
  resume_from_ckpt: False
  resume_ckpt_path: ${.save_dir}/checkpoints/last.ckpt
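
Note: the interpolations in this config (`${div_up:...}`, `${eval:...}`, `${cwd:}`, `${device_count:}`) rely on custom OmegaConf resolvers that must be registered before Hydra composes the config. A minimal sketch of how such resolvers can be registered (illustrative; the repository's own registration code may differ):
```python
import os

import torch
from omegaconf import OmegaConf

OmegaConf.register_new_resolver(
    'div_up', lambda x, y: (x + y - 1) // y)         # ceiling division
OmegaConf.register_new_resolver('eval', eval)        # arithmetic in configs
OmegaConf.register_new_resolver('cwd', os.getcwd)    # current working dir
OmegaConf.register_new_resolver(
    'device_count', torch.cuda.device_count)         # visible GPUs
```
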
muppit/configs/data/amazon_polarity.yaml
ADDED
@@ -0,0 +1,10 @@
train: amazon_polarity
valid: amazon_polarity
tokenizer_name_or_path: bert-base-uncased
cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
wrap: False
streaming: False
override_cache: False
add_special_tokens: True
label_col: label
num_classes: 2
muppit/configs/data/cifar10.yaml
ADDED
@@ -0,0 +1,11 @@
train: ???  # (Local) Path to CIFAR-10 training data
valid: ???  # (Local) Path to CIFAR-10 validation data
label_col: labels
num_classes: 10
streaming: False
size: 1024
length: 3072
add_special_tokens: True
add_mask_token: True
tokenizer_name_or_path: raw_pixels
muppit/configs/data/lm1b.yaml
ADDED
@@ -0,0 +1,8 @@
train: lm1b
valid: lm1b
tokenizer_name_or_path: bert-base-uncased
cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
wrap: False
streaming: False
override_cache: False
add_special_tokens: True
muppit/configs/data/peptide.yaml
ADDED
@@ -0,0 +1,8 @@
train: peptide
valid: peptide
tokenizer_name_or_path: facebook/esm2_t33_650M_UR50D
cache_dir: /home/tc415/discrete-diffusion-guidance/dataset
wrap: False
streaming: False
override_cache: False
add_special_tokens: True
muppit/configs/data/protein.yaml
ADDED
@@ -0,0 +1,8 @@
train: protein_400k
valid: protein_400k
tokenizer_name_or_path: facebook/esm2_t33_650M_UR50D
cache_dir: /home/tc415/discrete-diffusion-guidance/dataset
wrap: False
streaming: False
override_cache: False
add_special_tokens: True
muppit/configs/data/qm9.yaml
ADDED
@@ -0,0 +1,11 @@
train: qm9
valid: qm9
tokenizer_name_or_path: yairschiff/qm9-tokenizer
cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
wrap: False
streaming: False
override_cache: False
add_special_tokens: True
label_col: qed
label_col_pctile: 90
num_classes: 2
muppit/configs/data/ten_species.yaml
ADDED
@@ -0,0 +1,11 @@
train: ten_species
valid: ten_species
tokenizer_name_or_path: kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16
cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
wrap: False
streaming: False
override_cache: False
add_special_tokens: False
label_col: species_label
num_classes: 10
rc_aug: False
muppit/configs/data/text8.yaml
ADDED
@@ -0,0 +1,9 @@
# TODO: When using this dataset, set model.length = 256 to match D3PM setup
train: text8
valid: text8
tokenizer_name_or_path: text8
cache_dir: /share/kuleshov/ssahoo/textdiffusion/data
wrap: True
streaming: False
override_cache: False
add_special_tokens: False
muppit/configs/guidance/cbg.yaml
ADDED
@@ -0,0 +1,5 @@
method: cbg
condition: 0
classifier_checkpoint_path: '/home/tc415/muPPIt_embedding/checkpoints/mutBind_small'
gamma: 2.0
use_approx: False  # use first-order approximation
muppit/configs/guidance/cfg.yaml
ADDED
@@ -0,0 +1,3 @@
method: cfg
condition: 0
gamma: 1.0
muppit/configs/guidance/fudge.yaml
ADDED
@@ -0,0 +1,5 @@
method: fudge
condition: 0
classifier_checkpoint_path: ''
topk: 20
gamma: 1.0
muppit/configs/guidance/nos.yaml
ADDED
@@ -0,0 +1,6 @@
method: nos
condition: 0
classifier_checkpoint_path: ''
num_nos_steps: 1
nos_step_size: 0.1
nos_stability_coef: 0.01
muppit/configs/guidance/pplm.yaml
ADDED
@@ -0,0 +1,6 @@
method: pplm
condition: 0
classifier_checkpoint_path: ''
num_pplm_steps: 1
pplm_step_size: 0.1
pplm_stability_coef: 0.01
muppit/configs/lr_scheduler/constant_warmup.yaml
ADDED
@@ -0,0 +1,2 @@
_target_: transformers.get_constant_schedule_with_warmup
num_warmup_steps: 2500
muppit/configs/lr_scheduler/cosine_decay_warmup.yaml
ADDED
@@ -0,0 +1,7 @@
_target_: utils.CosineDecayWarmupLRScheduler
t_in_epochs: False
t_initial: ${eval:${trainer.max_steps}-${.warmup_t}}
warmup_prefix: True
warmup_lr_init: 1e-7
warmup_t: ${eval:0.1*${trainer.max_steps}}
lr_min: 1e-7
muppit/configs/model/dimamba.yaml
ADDED
@@ -0,0 +1,12 @@
name: dimamba
type: dimamba
hidden_size: 256
cond_dim: 128
length: 32768
n_blocks: 8
scale_by_sigma: True
dropout: 0.1
tie_word_embeddings: False
bidirectional: True
bidirectional_strategy: add
bidirectional_weight_tie: True
muppit/configs/model/fudge_predictor.yaml
ADDED
@@ -0,0 +1,4 @@
name: fudge_predictor
type: lstm
hidden_dim: 300
length: 1024
muppit/configs/model/hf.yaml
ADDED
@@ -0,0 +1,2 @@
pretrained_model_name_or_path: null
length: 128
muppit/configs/model/medium.yaml
ADDED
@@ -0,0 +1,10 @@
name: medium
type: ddit
hidden_size: 1024
cond_dim: 128
length: 82
n_blocks: 24
n_heads: 16
scale_by_sigma: True
dropout: 0.1
tie_word_embeddings: False
muppit/configs/model/small.yaml
ADDED
@@ -0,0 +1,11 @@
name: small
type: ddit
hidden_size: 768
cond_dim: 128
length: 15
# length_range: '6-49'
n_blocks: 12
n_heads: 12
scale_by_sigma: True
dropout: 0.1
tie_word_embeddings: False
muppit/configs/model/tiny.yaml
ADDED
@@ -0,0 +1,10 @@
name: tiny
type: ddit
hidden_size: 512
cond_dim: 128
length: 1024
n_blocks: 8
n_heads: 8
scale_by_sigma: True
dropout: 0.1
tie_word_embeddings: False
muppit/configs/model/unet.yaml
ADDED
@@ -0,0 +1,19 @@
name: unet
type: unet
ch: 128
num_res_blocks: 2
num_scales: 4
ch_mult: [1, 2, 2, 2]
input_channels: 3
output_channels: -1  # determined by vocab_size
scale_count_to_put_attn: 1  # at 16 res
data_min_max: [0, 255]  # No need currently
dropout: 0.1
skip_rescale: True
time_conditioning: True  # Whether to add in time embeddings
time_scale_factor: 1000
time_embed_dim: ${.ch}
fix_logistic: False
size: ${data.size}
cond_dim: ${.ch}
length: ${data.length}
muppit/configs/model/unet_campbell.yaml
ADDED
@@ -0,0 +1,19 @@
name: unet
type: unet
ch: 128
num_res_blocks: 2
num_scales: 4
ch_mult: [1, 2, 2, 2]
input_channels: 3
output_channels: -1  # determined by input_channels * 2
scale_count_to_put_attn: 1  # at 16 res
data_min_max: [0, 255]  # No need currently, determined by [0, vocab_size]
dropout: 0.1
skip_rescale: True
time_conditioning: True  # Whether to add in time embeddings
time_scale_factor: 1000
time_embed_dim: ${.ch}
fix_logistic: False
size: ${data.size}
cond_dim: ${.ch}
length: ${data.length}
muppit/configs/noise/ar.yaml
ADDED
@@ -0,0 +1,2 @@
type: ar
scale: 6.0
muppit/configs/noise/linear.yaml
ADDED
@@ -0,0 +1,3 @@
type: linear
sigma_min: 1e-3
sigma_max: 7.0
muppit/configs/noise/loglinear.yaml
ADDED
@@ -0,0 +1,3 @@
type: loglinear
sigma_min: 1e-4
sigma_max: 20
muppit/configs/noise/polynomial.yaml
ADDED
@@ -0,0 +1,5 @@
type: polynomial
a: -3
b: 5
c: -4
eps: 1e-3
muppit/configs/strategy/ddp.yaml
ADDED
@@ -0,0 +1,2 @@
_target_: lightning.pytorch.strategies.DDPStrategy
find_unused_parameters: false
muppit/configs/strategy/fsdp.yaml
ADDED
@@ -0,0 +1,3 @@
# TODO(yair): Currently not compatible with grad clipping
_target_: lightning.pytorch.strategies.FSDPStrategy
sharding_strategy: SHARD_GRAD_OP
muppit/custom_datasets/__init__.py
ADDED
@@ -0,0 +1,2 @@
from . import discretized_cifar10
from . import ten_species_dataset