Emaad committed on
Commit 548170b • 1 Parent(s): 65b1781

file upload

README.md CHANGED
@@ -1,12 +1,15 @@
  ---
- title: CELL-E 2-Sequence Prediction
- emoji: 💻
- colorFrom: blue
- colorTo: green
  sdk: gradio
- sdk_version: 3.29.0
  app_file: app.py
- pinned: false
  license: mit
  ---

  ---
+ title: CELL-E 2 - Sequence Prediction
+ emoji: 🔬
+ colorFrom: red
+ colorTo: purple
  sdk: gradio
+ python_version: 3.11
+ sdk_version: 3.30.0
  app_file: app.py
+ tags: [proteins, image-to-text]
+ fullWidth: true
+ pinned: true
  license: mit
  ---
app.py ADDED
@@ -0,0 +1,123 @@
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from prediction import run_sequence_prediction
+ import torch
+ import torchvision.transforms as T
+ from celle.utils import process_image
+ from PIL import Image
+ from matplotlib import pyplot as plt
+
+
+ def gradio_demo(model_name, sequence_input, image):
+     model = hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="model.ckpt")
+     config = hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="config.yaml")
+     hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="nucleus_vqgan.yaml")
+     hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="threshold_vqgan.yaml")
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     if "Finetuned" in model_name:
+         dataset = "OpenCell"
+     else:
+         dataset = "HPA"
+
+     nucleus_image = image["image"]
+     protein_image = image["mask"]
+
+     nucleus_image = process_image(nucleus_image, dataset, "nucleus")
+     protein_image = process_image(protein_image, dataset, "nucleus")
+     protein_image = 1.0 * (protein_image > 0.5)
+     print(f"{nucleus_image=}")
+     print(f"{protein_image.shape=}")
+
+     threshold, heatmap = run_sequence_prediction(
+         sequence_input=sequence_input,
+         nucleus_image=nucleus_image,
+         protein_image=protein_image,
+         model_ckpt_path=model,
+         model_config_path=config,
+         device=device,
+     )
+
+     protein_image = protein_image[0, 0]
+     protein_image = protein_image * 1.0
+
+     # Plot the heatmap
+     plt.imshow(heatmap.cpu(), cmap="rainbow", interpolation="bicubic")
+     plt.axis("off")
+
+     # Save the plot to a temporary file
+     plt.savefig("temp.png", bbox_inches="tight", dpi=256)
+
+     # Open the temporary file as a PIL image
+     heatmap = Image.open("temp.png")
+
+     return (
+         T.ToPILImage()(nucleus_image[0, 0]),
+         T.ToPILImage()(protein_image),
+         T.ToPILImage()(threshold),
+         heatmap,
+     )
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("Select the prediction model.")
+     gr.Markdown(
+         "CELL-E_2_HPA_2560 is a good general-purpose model for various cell types using ICC-IF."
+     )
+     gr.Markdown(
+         "CELL-E_2_OpenCell_2560 is trained on OpenCell and is better suited for live-cell predictions on HEK cells."
+     )
+     with gr.Row():
+         model_name = gr.Dropdown(
+             ["CELL-E_2_HPA_2560", "CELL-E_2_OpenCell_2560"],
+             value="CELL-E_2_HPA_2560",
+             label="Model Name",
+         )
+     with gr.Row():
+         gr.Markdown(
+             "Input the desired amino acid sequence. GFP is shown below by default."
+         )
+
+     with gr.Row():
+         sequence_input = gr.Textbox(
+             value="MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK",
+             label="Sequence",
+         )
+     with gr.Row():
+         gr.Markdown(
+             "Uploading a nucleus image is necessary. A random crop of 256 x 256 will be applied if larger. We provide default images in [images](https://huggingface.co/spaces/HuangLab/CELL-E_2/tree/main/images)."
+         )
+         gr.Markdown("The protein image is optional and is only used for display.")
+
+     with gr.Row().style(equal_height=True):
+         nucleus_image = gr.Image(
+             source="upload",
+             tool="sketch",
+             label="Nucleus Image",
+             line_color="white",
+             interactive=True,
+             image_mode="L",
+             type="pil",
+         )
+
+     with gr.Row():
+         gr.Markdown("Image predictions are shown below.")
+
+     with gr.Row().style(equal_height=True):
+         predicted_sequence = gr.Textbox(
+             label="Predicted Sequence",
+         )
+
+     with gr.Row():
+         button = gr.Button("Run Model")
+
+     inputs = [model_name, sequence_input, nucleus_image]
+
+     outputs = [predicted_sequence]
+
+     button.click(gradio_demo, inputs, outputs)
+
+ demo.launch(share=True)
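For quick local testing, the following is a minimal sketch (not part of this commit) of driving the same prediction path as gradio_demo() above without the Gradio UI. It only reuses names visible in this diff (the hf_hub_download filenames, process_image, and run_sequence_prediction with its keyword arguments); the input image path, the toy sequence, and the all-zero protein channel are placeholders or assumptions, since the demo treats the protein image as display-only.

# Hypothetical standalone driver mirroring gradio_demo() above; not part of this commit.
import torch
from PIL import Image
from huggingface_hub import hf_hub_download

from celle.utils import process_image
from prediction import run_sequence_prediction

model_name = "CELL-E_2_HPA_2560"  # one of the models offered in the dropdown above
ckpt = hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="model.ckpt")
config = hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="config.yaml")
# VQGAN configs cached alongside the checkpoint, as in gradio_demo()
hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="nucleus_vqgan.yaml")
hf_hub_download(repo_id=f"HuangLab/{model_name}", filename="threshold_vqgan.yaml")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "my_nucleus.png" is a placeholder path; the demo feeds a PIL image here.
nucleus = process_image(Image.open("my_nucleus.png"), "HPA", "nucleus")
protein = torch.zeros_like(nucleus)  # assumed: the demo only uses this channel for display

threshold, heatmap = run_sequence_prediction(
    sequence_input="MSKGEELFTGVVPILVELDGDVNGHK",  # placeholder amino acid string
    nucleus_image=nucleus,
    protein_image=protein,
    model_ckpt_path=ckpt,
    model_config_path=config,
    device=device,
)
# The demo converts `threshold` with T.ToPILImage() and plots `heatmap` with matplotlib.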
celle/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from celle.celle import CELLE
+ from celle.vae import VQGanVAE
+
+ __version__ = "2.0.0"
celle/attention.py ADDED
@@ -0,0 +1,253 @@
1
+ import torch
2
+ from torch import nn, einsum
3
+ import torch.nn.functional as F
4
+ from einops import rearrange, repeat
5
+
6
+ from rotary_embedding_torch import apply_rotary_emb
7
+ from celle.utils import exists, default, max_neg_value
8
+
9
+
10
+ # helpers
11
+ def stable_softmax(t, dim=-1, alpha=32**2):
12
+ t = t / alpha
13
+ t = t - torch.amax(t, dim=dim, keepdim=True).detach()
14
+ return (t * alpha).softmax(dim=dim)
15
+
16
+
17
+ def apply_pos_emb(pos_emb, qkv):
18
+ n = qkv[0].shape[-2]
19
+ pos_emb = pos_emb[..., :n, :]
20
+ return tuple(map(lambda t: apply_rotary_emb(pos_emb, t), qkv))
21
+
22
+
23
+ # classes
24
+ class Attention(nn.Module):
25
+ def __init__(
26
+ self,
27
+ dim,
28
+ seq_len,
29
+ causal=False,
30
+ heads=8,
31
+ dim_head=64,
32
+ dropout=0.0,
33
+ stable=False,
34
+ static_mask=None,
35
+ ):
36
+ super().__init__()
37
+ inner_dim = dim_head * heads
38
+ self.heads = heads
39
+ self.seq_len = seq_len
40
+ self.scale = dim_head**-0.5
41
+ self.stable = stable
42
+ self.causal = causal
43
+ self.register_buffer("static_mask", static_mask, persistent=False)
44
+ self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
45
+ self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
46
+ self.save_attn = nn.Identity()
47
+
48
+ def forward(self, x, context_mask=None, rotary_pos_emb=None):
49
+ # x: [batch_size, seq_len, dim]
50
+ b, n, _, h = *x.shape, self.heads
51
+ device = x.device
52
+
53
+ softmax = torch.softmax if not self.stable else stable_softmax
54
+
55
+ # qkv: 3 tensors of shape [batch_size, seq_len, inner_dim]
56
+ qkv = self.to_qkv(x).chunk(3, dim=-1)
57
+
58
+ # q,k,v: [batch_size, heads, seq_len, dim_head]
59
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)
60
+
61
+ if exists(rotary_pos_emb):
62
+ q, k, v = apply_pos_emb(rotary_pos_emb[..., :, :], (q, k, v))
63
+
64
+ q *= self.scale
65
+
66
+ # dots: [batch_size, heads, seq_len_i ,seq_len_j]
67
+ dots = torch.einsum("b h i d, b h j d -> b h i j", q, k)
68
+ mask_value = max_neg_value(dots)
69
+
70
+ if exists(context_mask):
71
+ # context_mask: [batch_size ,1 ,1 ,seq_len_j]
72
+ context_mask = rearrange(context_mask, "b j -> b 1 1 j")
73
+ context_mask = F.pad(context_mask, (1, 0), value=True)
74
+
75
+ mask_value = -torch.finfo(dots.dtype).max
76
+ dots = dots.masked_fill(~context_mask, mask_value)
77
+
78
+ if self.causal:
79
+ i, j = dots.shape[-2:]
80
+ context_mask = torch.ones(i, j, device=device).triu_(j - i + 1).bool()
81
+ dots.masked_fill_(context_mask, mask_value)
82
+
83
+ if exists(self.static_mask):
84
+ dots.masked_fill_(~self.static_mask[:n, :n], mask_value)
85
+
86
+ # attn: [batch_size ,heads ,seq_len_i ,seq_len_j]
87
+ attn = softmax(dots, dim=-1)
88
+ attn = self.save_attn(attn)
89
+
90
+ # out: [batch_size ,heads ,seq_len_i ,dim_head]
91
+ out = torch.einsum("b h n j, b h j d -> b h n d", attn, v)
92
+
93
+ # out: [batch_size ,seq_len_i ,(heads*dim_head)]
94
+ out = rearrange(out, "b h n d -> b n (h d)")
95
+
96
+ # out: [batch_size ,seq_len_i ,dim]
97
+ out = self.to_out(out)
98
+
99
+ return out
100
+
101
+
102
+ # sparse attention with convolutional pattern, as mentioned in the blog post. customizable kernel size and dilation
103
+
104
+
105
+ class SparseConvCausalAttention(nn.Module):
106
+ def __init__(
107
+ self,
108
+ dim,
109
+ seq_len,
110
+ image_size=32,
111
+ kernel_size=5,
112
+ dilation=1,
113
+ heads=8,
114
+ dim_head=64,
115
+ dropout=0.0,
116
+ stable=False,
117
+ **kwargs,
118
+ ):
119
+ super().__init__()
120
+ assert kernel_size % 2 == 1, "kernel size must be odd"
121
+
122
+ inner_dim = dim_head * heads
123
+ self.seq_len = seq_len
124
+ self.heads = heads
125
+ self.scale = dim_head**-0.5
126
+ self.image_size = image_size
127
+ self.kernel_size = kernel_size
128
+ self.dilation = dilation
129
+
130
+ self.stable = stable
131
+
132
+ self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
133
+
134
+ self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
135
+
136
+ def forward(self, x, mask=None, rotary_pos_emb=None):
137
+ b, n, _, h, img_size, kernel_size, dilation, seq_len, device = (
138
+ *x.shape,
139
+ self.heads,
140
+ self.image_size,
141
+ self.kernel_size,
142
+ self.dilation,
143
+ self.seq_len,
144
+ x.device,
145
+ )
146
+ softmax = torch.softmax if not self.stable else stable_softmax
147
+
148
+ img_seq_len = img_size**2
149
+ text_len = seq_len + 1 - img_seq_len
150
+
151
+ # padding
152
+
153
+ padding = seq_len - n + 1
154
+ mask = default(mask, lambda: torch.ones(b, text_len, device=device).bool())
155
+
156
+ x = F.pad(x, (0, 0, 0, padding), value=0)
157
+ mask = mask[:, :text_len]
158
+
159
+ # derive query / keys / values
160
+
161
+ qkv = self.to_qkv(x).chunk(3, dim=-1)
162
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), qkv)
163
+
164
+ if exists(rotary_pos_emb):
165
+ q, k, v = apply_pos_emb(rotary_pos_emb, (q, k, v))
166
+
167
+ q *= self.scale
168
+
169
+ ((q_text, q_img), (k_text, k_img), (v_text, v_img)) = map(
170
+ lambda t: (t[:, :-img_seq_len], t[:, -img_seq_len:]), (q, k, v)
171
+ )
172
+
173
+ # text attention
174
+
175
+ dots_text = einsum("b i d, b j d -> b i j", q_text, k_text)
176
+ mask_value = max_neg_value(dots_text)
177
+
178
+ i, j = dots_text.shape[-2:]
179
+ text_causal_mask = torch.ones(i, j, device=device).triu_(j - i + 1).bool()
180
+ dots_text.masked_fill_(text_causal_mask, mask_value)
181
+
182
+ attn_text = softmax(dots_text, dim=-1)
183
+ out_text = einsum("b i j, b j d -> b i d", attn_text, v_text)
184
+
185
+ # image attention
186
+
187
+ effective_kernel_size = (kernel_size - 1) * dilation + 1
188
+ padding = effective_kernel_size // 2
189
+
190
+ k_img, v_img = map(
191
+ lambda t: rearrange(t, "b (h w) c -> b c h w", h=img_size), (k_img, v_img)
192
+ )
193
+ k_img, v_img = map(
194
+ lambda t: F.unfold(t, kernel_size, padding=padding, dilation=dilation),
195
+ (k_img, v_img),
196
+ )
197
+ k_img, v_img = map(
198
+ lambda t: rearrange(t, "b (d j) i -> b i j d", j=kernel_size**2),
199
+ (k_img, v_img),
200
+ )
201
+
202
+ # let image attend to all of text
203
+
204
+ dots_image = einsum("b i d, b i j d -> b i j", q_img, k_img)
205
+ dots_image_to_text = einsum("b i d, b j d -> b i j", q_img, k_text)
206
+
207
+ # calculate causal attention for local convolution
208
+
209
+ i, j = dots_image.shape[-2:]
210
+ img_seq = torch.arange(img_seq_len, device=device)
211
+ k_img_indices = rearrange(img_seq.float(), "(h w) -> () () h w", h=img_size)
212
+ k_img_indices = F.pad(
213
+ k_img_indices, (padding,) * 4, value=img_seq_len
214
+ ) # padding set to be max, so it is never attended to
215
+ k_img_indices = F.unfold(k_img_indices, kernel_size, dilation=dilation)
216
+ k_img_indices = rearrange(k_img_indices, "b j i -> b i j")
217
+
218
+ # mask image attention
219
+
220
+ q_img_indices = rearrange(img_seq, "i -> () i ()")
221
+ causal_mask = q_img_indices < k_img_indices
222
+
223
+ # concat text mask with image causal mask
224
+
225
+ causal_mask = repeat(causal_mask, "() i j -> b i j", b=b * h)
226
+ mask = repeat(mask, "b j -> (b h) i j", i=i, h=h)
227
+ mask = torch.cat((~mask, causal_mask), dim=-1)
228
+
229
+ # image can attend to all of text
230
+
231
+ dots = torch.cat((dots_image_to_text, dots_image), dim=-1)
232
+ dots.masked_fill_(mask, mask_value)
233
+
234
+ attn = softmax(dots, dim=-1)
235
+
236
+ # aggregate
237
+
238
+ attn_image_to_text, attn_image = attn[..., :text_len], attn[..., text_len:]
239
+
240
+ out_image_to_image = einsum("b i j, b i j d -> b i d", attn_image, v_img)
241
+ out_image_to_text = einsum("b i j, b j d -> b i d", attn_image_to_text, v_text)
242
+
243
+ out_image = out_image_to_image + out_image_to_text
244
+
245
+ # combine attended values for both text and image
246
+
247
+ out = torch.cat((out_text, out_image), dim=1)
248
+
249
+ out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
250
+
251
+ out = self.to_out(out)
252
+
253
+ return out[:, :n]
celle/celle.py ADDED
@@ -0,0 +1,1061 @@
1
+ # Import necessary packages and modules
2
+ from math import floor, ceil
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ from axial_positional_embedding import AxialPositionalEmbedding
7
+ from einops import rearrange
8
+ from celle.utils import (
9
+ exists,
10
+ always,
11
+ eval_decorator,
12
+ gumbel_sample,
13
+ top_k,
14
+ gamma_func,
15
+ DivideMax,
16
+ )
17
+ from tqdm import tqdm
18
+
19
+ # Import additional modules from within the codebase
20
+ from celle.transformer import Transformer
21
+
22
+
23
+ def generate_mask(gamma_func, batch_size, length, device):
24
+ # Get the number of `True` values in the mask for each batch element
25
+ num_true_values = floor(gamma_func(torch.rand(1)) * length)
26
+
27
+ # Generate a random sample of indices to set to `True` in the mask
28
+ # The number of indices in the sample is determined by `num_true_values`
29
+ indices = (
30
+ torch.rand((batch_size, length), device=device)
31
+ .topk(num_true_values, dim=1)
32
+ .indices
33
+ )
34
+
35
+ # Create a binary mask tensor with `True` values at the sampled indices
36
+ mask = torch.zeros((batch_size, length), dtype=torch.bool, device=device)
37
+ mask.scatter_(dim=1, index=indices, value=True)
38
+
39
+ return mask
40
+
41
+
42
+ def match_batch_size(text, condition, image, batch_size):
43
+ """
44
+ This function ensures all inputs to the sample function have the same batch size.
45
+ """
46
+ if text.shape[0] != batch_size:
47
+ text = text.repeat(batch_size, 1)
48
+
49
+ if condition.shape[0] != batch_size:
50
+ condition = condition.repeat(batch_size, 1)
51
+
52
+ if image.shape[0] != batch_size:
53
+ image = image.repeat(batch_size, 1)
54
+
55
+ return text, condition, image
56
+
57
+
58
+ def calc_unmask_probs(timestep, timesteps, gamma_func):
59
+ if timestep == 1 or timesteps == 1:
60
+ unmask_prob = 1
61
+ else:
62
+ unmask_prob = 1 - gamma_func(timestep)
63
+ return unmask_prob
64
+
65
+
66
+ def calculate_logits(
67
+ input_tokens, input_mask, logits_function, filter_thres, temperature
68
+ ):
69
+ logits, _, _ = logits_function(input_tokens, input_mask, return_encoding=False)
70
+ filtered_logits = top_k(logits, thres=filter_thres)
71
+ sample = gumbel_sample(filtered_logits, temperature=temperature, dim=-1)
72
+
73
+ return logits, sample
74
+
75
+
76
+ def unmask_tokens(
77
+ input_tokens,
78
+ input_mask,
79
+ num_masked_tokens,
80
+ logits,
81
+ sample,
82
+ timestep,
83
+ timesteps,
84
+ gamma,
85
+ filter_func=None,
86
+ pad_token=None,
87
+ mask_token=None,
88
+ force_aas=True,
89
+ ):
90
+ sample = sample.masked_fill(~input_mask.unsqueeze(-1), -torch.inf)
91
+ if filter_func:
92
+ sample = filter_func(
93
+ input_tokens, sample, force_aas, pad_token=pad_token, mask_token=mask_token
94
+ )
95
+ selected_token_probs, selected_tokens = torch.max(sample, dim=-1)
96
+
97
+ unmask_prob = calc_unmask_probs(timestep, timesteps, gamma)
98
+ num_tokens_to_unmask = max(1, ceil(unmask_prob * num_masked_tokens))
99
+
100
+ _, top_k_indices = torch.topk(selected_token_probs, num_tokens_to_unmask, dim=-1)
101
+
102
+ sample_mask = torch.zeros(
103
+ input_tokens.shape, dtype=torch.bool, device=input_tokens.device
104
+ )
105
+ sample_mask.scatter_(dim=1, index=top_k_indices, value=True)
106
+
107
+ unmasked_tokens = torch.where(sample_mask, selected_tokens, input_tokens)
108
+ full_logits = torch.where(
109
+ sample_mask.unsqueeze(-1), logits, torch.zeros_like(logits)
110
+ )
111
+ return unmasked_tokens, full_logits
112
+
113
+
114
+ def suppress_invalid_text_tokens(
115
+ text,
116
+ logits,
117
+ start_token=None,
118
+ end_token=None,
119
+ pad_token=None,
120
+ mask_token=None,
121
+ force_aas=False,
122
+ ):
123
+ # Find the indices of start_token and end_token in tensor text along axis=1
124
+ idx_start = (text == start_token).nonzero(as_tuple=True)[1]
125
+ idx_end = (text == end_token).nonzero(as_tuple=True)[1]
126
+
127
+ # For every position other than the index corresponding to the start index, set the values on the start index of dimension=2 to -torch.inf
128
+ if idx_start.nelement() != start_token:
129
+ try:
130
+ mask = idx_start.unsqueeze(1) != torch.arange(
131
+ logits.size(1), device=text.device
132
+ )
133
+ indices = torch.where(mask)
134
+ logits[indices[0], indices[1], start_token] = -torch.inf
135
+ except:
136
+ pass
137
+
138
+ # else:
139
+ # idx_start = torch.zeros(text.size(0), dtype=torch.long)
140
+
141
+ # Similarly, for every position other than the index corresponding to the end index, set the values on the end index of dimension=2 to -torch.inf
142
+ if idx_end.nelement() != 0:
143
+ try:
144
+ mask = idx_end.unsqueeze(1) != torch.arange(
145
+ logits.size(1), device=text.device
146
+ )
147
+ indices = torch.where(mask)
148
+ logits[indices[0], indices[1], end_token] = -torch.inf
149
+ except:
150
+ pass
151
+
152
+ # else:
153
+ # idx_end = torch.full((text.size(0),), text.size(1) - 1, dtype=torch.long)
154
+
155
+ if pad_token:
156
+ if idx_start.nelement() != 0 and idx_end.nelement() != 0:
157
+ try:
158
+ # For every position between the indices of start_token and end_token, set the values for 1st index of dimension=2 equal to -torch.inf. Any value outside of that range should be set to torch.inf.
159
+ mask = (
160
+ torch.arange(logits.size(1), device=text.device)
161
+ >= idx_start.unsqueeze(1)
162
+ ) & (
163
+ torch.arange(logits.size(1), device=text.device)
164
+ <= idx_end.unsqueeze(1)
165
+ )
166
+
167
+ indices = torch.where(mask)
168
+ logits[indices[0], indices[1], pad_token] = -torch.inf
169
+
170
+ indices = torch.where(~mask)
171
+ logits[indices[0], indices[1], pad_token] = torch.inf
172
+
173
+ except:
174
+ pass
175
+
176
+ elif idx_start.nelement() != 0:
177
+ try:
178
+ mask = torch.arange(
179
+ logits.size(1), device=text.device
180
+ ) < idx_start.unsqueeze(1)
181
+ logits[indices[0], indices[1], pad_token] = torch.inf
182
+ except:
183
+ pass
184
+
185
+ elif idx_end.nelement() != 0:
186
+ try:
187
+ mask = torch.arange(
188
+ logits.size(1), device=text.device
189
+ ) > idx_end.unsqueeze(1)
190
+ logits[indices[0], indices[1], pad_token] = torch.inf
191
+ except:
192
+ pass
193
+
194
+ if force_aas:
195
+ if pad_token:
196
+ logits[:, :, pad_token] = -torch.inf
197
+ logits[:, :, 3] = -torch.inf
198
+ logits[:, :, 29:] = -torch.inf
199
+
200
+ if mask_token:
201
+ logits[:, :, mask_token] = -torch.inf
202
+
203
+ return logits
204
+
205
+
206
+ def detokenize_text(text_embedding, sequence):
207
+ if text_embedding == "esm1b" or text_embedding == "esm2":
208
+ from esm import Alphabet
209
+
210
+ alphabet = (
211
+ Alphabet.from_architecture("ESM-1b").get_batch_converter().alphabet.all_toks
212
+ )
213
+ else:
214
+ raise NameError("Detokenization is only available for ESM models")
215
+
216
+ output_seqs = []
217
+
218
+ for batch in sequence:
219
+ converted_seq = [alphabet[idx] for idx in batch]
220
+ converted_seq = "".join(converted_seq)
221
+ output_seqs.append(converted_seq)
222
+
223
+ return output_seqs
224
+
225
+ class ImageEmbedding(nn.Module):
226
+ def __init__(self, num_tokens, dim):
227
+ super(ImageEmbedding, self).__init__()
228
+ self.image_embedding = nn.Embedding(num_tokens, dim)
229
+
230
+ def forward(self, image):
231
+ return self.image_embedding(image)
232
+
233
+
234
+ class ModelExtender(nn.Module):
235
+ def __init__(self, vocab, out_features, fixed_embedding=False):
236
+ super(ModelExtender, self).__init__()
237
+
238
+ # Initialize the model according to the given vocabulary
239
+ self.vocab = vocab
240
+
241
+ if vocab == "esm1b":
242
+ from esm import pretrained
243
+
244
+ self.model, _ = pretrained.esm1b_t33_650M_UR50S()
245
+ self.in_features = 1280
246
+ elif vocab == "esm2":
247
+ from esm import pretrained
248
+
249
+ if out_features == 320:
250
+ self.model, _ = pretrained.esm2_t6_8M_UR50D()
251
+ elif out_features == 480:
252
+ self.model, _ = pretrained.esm2_t12_35M_UR50D()
253
+ elif out_features == 640:
254
+ self.model, _ = pretrained.esm2_t30_150M_UR50D()
255
+ elif out_features == 1280:
256
+ self.model, _ = pretrained.esm2_t33_650M_UR50D()
257
+ elif out_features == 2560:
258
+ self.model, _ = pretrained.esm2_t36_3B_UR50D()
259
+ else:
260
+ self.model, _ = pretrained.esm2_t33_650M_UR50D()
261
+ self.in_features = self.model.embed_dim
262
+
263
+ # Set the number of output features and initialize the scaling layer
264
+ self.out_features = out_features
265
+ self.scale_layer = nn.Linear(self.in_features, self.out_features)
266
+
267
+ # Determine whether to freeze the model's parameters
268
+ self.fixed_embedding = fixed_embedding
269
+ if self.fixed_embedding:
270
+ self.model = self.model.eval()
271
+
272
+ def forward(self, x, **kwargs):
273
+ # If the model's parameters are fixed, use torch.no_grad()
274
+ if self.fixed_embedding:
275
+ with torch.no_grad():
276
+ if self.vocab == "esm1b" or self.vocab == "esm2":
277
+ # Reduce sequence length dimension, get top layer representation tensor
278
+ x = self.model(x.squeeze(1), repr_layers=[self.model.num_layers])[
279
+ "representations"
280
+ ][self.model.num_layers]
281
+ # Tensor shape: (batch_size, hidden_size)
282
+ else:
283
+ # Get top layer representation tensor
284
+ x = self.model(x, **kwargs)[0]
285
+ # Tensor shape: (batch_size, sequence_length, hidden_size)
286
+ else:
287
+ if self.vocab == "esm1b" or self.vocab == "esm2":
288
+ # Reduce sequence length dimension, get top layer representation tensor
289
+ x = self.model(x.squeeze(1), repr_layers=[self.model.num_layers])[
290
+ "representations"
291
+ ][self.model.num_layers]
292
+ # Tensor shape: (batch_size, hidden_size)
293
+ else:
294
+ # Get top layer representation tensor
295
+ x = self.model(x, **kwargs)[0]
296
+ # Tensor shape: (batch_size, sequence_length, hidden_size)
297
+
298
+ # Scale the representation tensor if necessary
299
+ if self.out_features != self.in_features:
300
+ x = self.scale_layer(x)
301
+ # Tensor shape: (batch_size, out_features)
302
+
303
+ return x
304
+
305
+ class CELLE(nn.Module):
306
+ def __init__(
307
+ self,
308
+ *,
309
+ dim,
310
+ vae, # The VAE model used to encode/decode images
311
+ condition_vae=None, # An optional VAE model used to condition the image generation
312
+ num_images=2, # Number of images to generate
313
+ num_text_tokens=30, # Number of tokens in the text vocabulary
314
+ text_seq_len=1000, # Maximum length of input text sequence
315
+ depth=16, # Number of layers in the transformer model
316
+ heads=16, # Number of attention heads
317
+ dim_head=64, # Dimensionality of each attention head
318
+ attn_dropout=0.1, # Dropout rate for attention weights
319
+ ff_dropout=0.1, # Dropout rate for feedforward layers
320
+ attn_types=None, # Types of attention to use in the transformer
321
+ causal=False, # Whether to use causal attention
322
+ loss_cond_weight=1, # Weight of conditioning loss
323
+ loss_img_weight=1, # Weight of image generation loss
324
+ stable=False, # Whether to use divide-by-max normalization in the transformer
325
+ rotary_emb=True, # Whether to use rotary positional embeddings
326
+ text_embedding="esm2", # Text embedding to use (esm1b, esm2)
327
+ fixed_embedding=True, # Whether to fix the text embedding or learn it
328
+ sampling_mode="cosine", # Sampling mode for the VAE
329
+ linear_project=False, # Whether to project embeddings linearly
330
+ **kwargs,
331
+ ):
332
+ super().__init__()
333
+
334
+ # Set the stable flag
335
+ self.stable = stable
336
+
337
+ # If the stable flag is set, initialize the DivideMax layer for normalization
338
+ if stable:
339
+ self.norm_by_max = DivideMax(dim=-1)
340
+
341
+ ### Initializing text parameters ###
342
+
343
+ # Initialize the text and fixed embeddings
344
+ self.text_embedding = text_embedding
345
+ self.fixed_embedding = fixed_embedding
346
+
347
+ # Offset logits index and calculate cross entropy loss
348
+ self.num_text_tokens = num_text_tokens
349
+ self.linear_project = linear_project
350
+
351
+ # Add <BOS> and <EOS> tokens to the beginning and end of text sequences
352
+ if text_embedding.lower() in ("esm1b", "esm2"):
353
+ self.text_seq_len = text_seq_len + 2
354
+ else:
355
+ self.text_seq_len = text_seq_len
356
+
357
+ # Initialize embeddings for <SEP> token
358
+ self.sep_emb = nn.Embedding(1, dim)
359
+
360
+ # Initialize positional embeddings for text sequences and <SEP> token
361
+ self.text_pos_emb = (
362
+ nn.Embedding(self.text_seq_len + 1, dim) if not rotary_emb else always(0)
363
+ ) # +1 for <SEP>
364
+
365
+ ### ###
366
+
367
+ self.num_images = num_images
368
+
369
+ ### Initializing condition parameters ###
370
+
371
+ # Initialize the number of condition tokens, condition sequence length, and condition embedding
372
+ if exists(condition_vae):
373
+ condition_size = condition_vae.image_size
374
+ num_condition_tokens = condition_vae.num_tokens
375
+ self.num_condition_tokens = num_condition_tokens
376
+ condition_fmap_size = condition_vae.image_size // (
377
+ 2**condition_vae.num_layers
378
+ )
379
+ condition_seq_len = condition_fmap_size**2
380
+
381
+ # Initialize ImageEmbedding for condition embedding
382
+ self.condition_emb = ImageEmbedding(num_condition_tokens + 1, dim)
383
+
384
+ # Initialize positional embeddings for condition embedding
385
+ self.condition_pos_emb = (
386
+ AxialPositionalEmbedding(
387
+ dim, axial_shape=(condition_fmap_size, condition_fmap_size)
388
+ )
389
+ if not rotary_emb
390
+ else always(0)
391
+ )
392
+
393
+ else:
394
+ condition_fmap_size = 0
395
+ condition_seq_len = 0
396
+ num_condition_tokens = 0
397
+
398
+ ### ####
399
+
400
+ ### Initializing image parameters ###
401
+
402
+ # Initialize the image size, image token size, and sequence length
403
+ self.image_size = vae.image_size
404
+ num_image_tokens = vae.num_tokens
405
+ image_fmap_size = vae.image_size // (2**vae.num_layers)
406
+ image_seq_len = image_fmap_size**2
407
+ self.image_seq_len = image_seq_len
408
+ self.num_image_tokens = num_image_tokens
409
+
410
+ # Initialize ImageEmbedding and positional embeddings for image embedding
411
+ self.image_emb = ImageEmbedding(num_image_tokens + 1, dim) # +1 for <IM_MASK>
412
+
413
+ self.image_pos_emb = (
414
+ AxialPositionalEmbedding(
415
+ dim, axial_shape=(image_fmap_size, image_fmap_size)
416
+ )
417
+ if not rotary_emb
418
+ else always(0)
419
+ )
420
+
421
+ # Set total sequence length and total tokens
422
+ self.num_condition_tokens = num_condition_tokens
423
+ self.condition_seq_len = condition_seq_len
424
+ # Text Length + <SEP> + Condition Tokens + Image Tokens
425
+ seq_len = self.text_seq_len + 1 + self.condition_seq_len + self.image_seq_len
426
+ total_tokens = (
427
+ num_text_tokens + 1 + num_condition_tokens + 1 + num_image_tokens + 1
428
+ )
429
+ self.total_tokens = total_tokens
430
+ self.total_seq_len = seq_len
431
+
432
+ # Set the VAE and condition VAE for the model
433
+ self.vae = vae.eval()
434
+ self.condition_vae = condition_vae.eval()
435
+
436
+ ### ###
437
+
438
+ ### Setting discrete ids ###
439
+ # Initialize text embedding based on the given text_embedding parameter
440
+ if text_embedding == "esm1b" or text_embedding == "esm2":
441
+ self.text_mask_token = 32
442
+ self.pad_token = 1
443
+ self.text_emb = ModelExtender(text_embedding, dim, fixed_embedding)
444
+ else:
445
+ raise ValueError("Only ESM models are supported.")
446
+
447
+ # Set token indices for text, condition, and image sequences
448
+ self.sep_token = num_text_tokens
449
+ self.cond_mask_token = num_condition_tokens
450
+ self.image_mask_token = num_image_tokens
451
+
452
+ # Create indices for sequence and logits dimensions
453
+ self.seq_range = torch.arange(seq_len)
454
+ self.logits_range = torch.arange(total_tokens)
455
+
456
+ # Reshape sequence and logits indices
457
+ self.seq_range = rearrange(self.seq_range, "n -> () n ()")
458
+ self.logits_range = rearrange(self.logits_range, "d -> () () d")
459
+
460
+ # Create a mask to exclude invalid token positions from the model output
461
+ # e.g. no image tokens where sequence tokens should be
462
+ logits_mask = (
463
+ # Mask text tokens beyond text_seq_len and invalid logits_range
464
+ (
465
+ (self.seq_range < self.text_seq_len)
466
+ & (self.logits_range < num_text_tokens)
467
+ & (self.logits_range != self.text_mask_token)
468
+ )
469
+ |
470
+ # Mask [SEP] token after text
471
+ (
472
+ (self.seq_range == self.text_seq_len)
473
+ & (self.logits_range == num_text_tokens)
474
+ )
475
+ |
476
+ # Mask condition tokens beyond text_seq_len+1 ([SEP]) and invalid logits_range
477
+ (
478
+ (self.seq_range >= self.text_seq_len + 1)
479
+ & (self.seq_range < self.text_seq_len + 1 + condition_seq_len)
480
+ & (self.logits_range >= num_text_tokens + 1)
481
+ & (self.logits_range < num_text_tokens + 1 + num_condition_tokens)
482
+ )
483
+ |
484
+ # Mask image tokens beyond num_text_tokens+num_condition_tokens+1
485
+ (
486
+ (self.seq_range >= self.text_seq_len + 1 + condition_seq_len)
487
+ & (self.logits_range >= num_text_tokens + 1 + num_condition_tokens + 1)
488
+ & (
489
+ self.logits_range
490
+ < num_text_tokens + 1 + num_condition_tokens + 1 + num_image_tokens
491
+ )
492
+ )
493
+ )
494
+
495
+ # Invert the mask
496
+ logits_mask = ~logits_mask
497
+
498
+ # Register the buffer with the logits_mask
499
+ self.register_buffer("logits_mask", logits_mask, persistent=False)
500
+
501
+ ### ###
502
+
503
+ # Initialize the Transformer model with given parameters
504
+ self.transformer = Transformer(
505
+ dim=dim,
506
+ causal=causal,
507
+ seq_len=seq_len,
508
+ depth=depth,
509
+ heads=heads,
510
+ dim_head=dim_head,
511
+ attn_dropout=attn_dropout,
512
+ ff_dropout=ff_dropout,
513
+ image_fmap_size=image_fmap_size + condition_fmap_size,
514
+ num_images=num_images,
515
+ stable=stable,
516
+ rotary_emb=rotary_emb,
517
+ )
518
+
519
+ # Initialize the linear layers for converting transformer output to logits
520
+ self.to_logits = nn.Sequential(
521
+ nn.LayerNorm(dim),
522
+ nn.Linear(dim, self.total_tokens),
523
+ )
524
+
525
+ # Set instance variables for weights and critic
526
+ self.loss_img_weight = loss_img_weight
527
+ self.loss_cond_weight = loss_cond_weight
528
+ self.gamma = gamma_func(sampling_mode)
529
+
530
+ def embed_and_transform(self, inputs, masks, return_encoding=False):
531
+ text, condition, image = inputs
532
+ device = text.device
533
+ text_mask, _, image_mask = masks
534
+
535
+ text_labels = text.clone()
536
+ text = torch.where(
537
+ text_mask, self.text_mask_token * torch.ones_like(text, device=device), text
538
+ )
539
+
540
+ tokens = self.text_emb(text)
541
+
542
+ # Add SEP token
543
+
544
+ sep_token_emb = self.sep_emb(
545
+ torch.zeros((tokens.shape[0], 1), dtype=torch.long, device=device)
546
+ )
547
+ tokens = torch.cat((tokens, sep_token_emb), dim=1)
548
+ tokens += self.text_pos_emb(torch.arange(text.shape[1] + 1, device=device))
549
+
550
+ with torch.no_grad():
551
+ if self.linear_project:
552
+ b = condition.shape[0]
553
+ condition, _, [_, _, condition_labels] = self.condition_vae.encode(
554
+ condition
555
+ )
556
+ condition_labels = rearrange(condition_labels, "(b n) -> b n", b=b)
557
+
558
+ else:
559
+ condition_labels = condition
560
+ if condition.dtype == torch.float:
561
+ condition_labels = self.condition_vae.get_codebook_indices(
562
+ condition
563
+ )
564
+ condition = condition_labels.clone()
565
+
566
+ condition_emb = self.condition_emb(condition)
567
+ condition_emb += self.condition_pos_emb(condition_emb)
568
+ tokens = torch.cat((tokens, condition_emb), dim=1)
569
+
570
+ with torch.no_grad():
571
+ if self.linear_project:
572
+ b = image.shape[0]
573
+ image, _, [_, _, image_labels] = self.vae.encode(image)
574
+ image_labels = rearrange(image_labels, "(b n) -> b n", b=b)
575
+
576
+ else:
577
+ image_labels = image
578
+ if image.dtype == torch.float:
579
+ image_labels = self.vae.get_codebook_indices(image)
580
+ image = torch.where(
581
+ image_mask,
582
+ self.image_mask_token
583
+ * torch.ones_like(image_labels, device=device),
584
+ image_labels,
585
+ )
586
+
587
+ image_emb = self.image_emb(image)
588
+
589
+ image_emb += self.image_pos_emb(image_emb)
590
+ tokens = torch.cat((tokens, image_emb), dim=1)
591
+
592
+ if self.stable:
593
+ alpha = 0.1
594
+ tokens = tokens * alpha + tokens.detach() * (1 - alpha)
595
+
596
+ out = self.transformer(tokens)
597
+
598
+ if self.stable:
599
+ out = self.norm_by_max(out)
600
+
601
+ logits = self.to_logits(out)
602
+
603
+ max_neg_value = -torch.finfo(logits.dtype).max
604
+ logits.masked_fill_(self.logits_mask, max_neg_value)
605
+
606
+ if return_encoding:
607
+ return logits, out, [text_labels, condition_labels, image_labels]
608
+ else:
609
+ return logits, None, [text_labels, condition_labels, image_labels]
610
+
611
+ def forward(
612
+ self,
613
+ text,
614
+ condition=None,
615
+ image=None,
616
+ return_loss=False,
617
+ return_encoding=False,
618
+ ):
619
+ batch_size, device = text.shape[0], text.device
620
+
621
+ # Check that image is supplied when training
622
+ assert exists(image), "when training, image must be supplied"
623
+
624
+ # Check that image dimensions match the expected dimensions
625
+ assert tuple(image.shape[1:]) == (
626
+ self.vae.channels,
627
+ self.image_size,
628
+ self.image_size,
629
+ ), f"invalid image of dimensions {image.shape} passed in during training"
630
+
631
+ # Generate masks for text, condition, and image
632
+
633
+ # text_mask = generate_mask(self.gamma, batch_size, self.text_seq_len, device)
634
+
635
+ text_mask = generate_mask(
636
+ gamma_func("scaled-cosine"), batch_size, self.text_seq_len, device
637
+ )
638
+
639
+ image_mask = generate_mask(self.gamma, batch_size, self.image_seq_len, device)
640
+
641
+ # Embed and transform inputs
642
+ logits, _, labels = self.embed_and_transform(
643
+ [text, condition, image],
644
+ [text_mask, None, image_mask],
645
+ return_encoding,
646
+ device,
647
+ )
648
+
649
+ # If not returning loss, return the logits
650
+ if not return_loss:
651
+ return logits
652
+
653
+ # Separate labels
654
+ text, condition, image = labels
655
+
656
+ # Add SEP token to end of text label
657
+ sep_token = torch.tensor(self.sep_token, device=device).repeat(
658
+ labels.shape[0], 1
659
+ )
660
+ labels = torch.cat([labels, sep_token], dim=1)
661
+
662
+ # If condition exists and condition vae is defined, add the condition to the labels
663
+ if exists(condition) and exists(self.condition_vae):
664
+ offsetted_condition = condition + self.num_text_tokens + 1
665
+ labels = torch.cat((labels, offsetted_condition), dim=1)
666
+
667
+ # Add image to the labels
668
+ offsetted_image = (
669
+ image + self.num_text_tokens + 1 + self.num_condition_tokens + 1
670
+ )
671
+ labels = torch.cat((labels, offsetted_image), dim=1)
672
+
673
+ # Rearrange logits for cross-entropy loss calculation
674
+ # Logits size: (batch_size, vocab_size, total_seq_len)
675
+ # Labels size: (batch_size, total_seq_len)
676
+ logits = rearrange(logits, "b n c -> b c n")
677
+
678
+ # Calculate cross-entropy loss for text and image
679
+ loss_text = F.cross_entropy(
680
+ logits[:, :, : self.text_seq_len],
681
+ labels[:, : self.text_seq_len],
682
+ reduction="none",
683
+ )[text_mask].mean()
684
+
685
+ loss_img = F.cross_entropy(
686
+ logits[:, :, self.text_seq_len + 1 + self.condition_seq_len :],
687
+ labels[:, self.text_seq_len + 1 + self.condition_seq_len :],
688
+ reduction="none",
689
+ )[image_mask].mean()
690
+
691
+ # Calculate total loss
692
+ loss = (loss_text + self.loss_img_weight * loss_img) / (
693
+ self.loss_img_weight + 1
694
+ )
695
+
696
+ loss_dict = {
697
+ "loss_text": loss_text,
698
+ # "loss_cond": loss_cond,
699
+ "loss_img": loss_img,
700
+ "loss": torch.nan_to_num(loss, 0.0, 0.0, 0.0),
701
+ }
702
+
703
+ return loss, loss_dict, None
704
+
705
+ def create_tensors(self, text, condition, image):
706
+ """
707
+ This function creates tensors for text, condition, and image when they are not provided as inputs to the sample function.
708
+ """
709
+ device = next(
710
+ filter(lambda x: isinstance(x, torch.Tensor), [text, condition, image]),
711
+ None,
712
+ ).device
713
+
714
+ if not isinstance(text, torch.Tensor):
715
+ text = (
716
+ torch.ones(1, self.text_seq_len, device=device, dtype=torch.long)
717
+ * self.text_mask_token
718
+ )
719
+
720
+ if not isinstance(condition, torch.Tensor):
721
+ condition = (
722
+ torch.ones(1, self.condition_seq_len, device=device, dtype=torch.long)
723
+ * self.cond_mask_token
724
+ )
725
+ else:
726
+ with torch.no_grad():
727
+ condition = self.condition_vae.get_codebook_indices(condition)
728
+
729
+ if not isinstance(image, torch.Tensor):
730
+ image = (
731
+ torch.ones(1, self.image_seq_len, device=device, dtype=torch.long)
732
+ * self.image_mask_token
733
+ )
734
+ else:
735
+ with torch.no_grad():
736
+ image = self.vae.get_codebook_indices(image)
737
+
738
+ return text, condition, image
739
+
740
+ @torch.no_grad()
741
+ @eval_decorator
742
+ def sample(
743
+ self,
744
+ text=None,
745
+ condition=None,
746
+ image=None,
747
+ temperature=1.0,
748
+ filter_thres=0.9,
749
+ progress=False,
750
+ timesteps=1,
751
+ force_aas=True,
752
+ ):
753
+ # ensure timesteps is a positive integer
754
+ assert int(timesteps) > 0
755
+ # set model and VAEs to evaluation mode
756
+ self.eval()
757
+ vae = self.vae.eval()
758
+ if progress == True:
759
+ progress = tqdm
760
+ else:
761
+ progress = lambda x: x
762
+
763
+
764
+ # ensure that at least one of text, condition, or image is supplied
765
+ assert (
766
+ isinstance(text, torch.Tensor)
767
+ or isinstance(condition, torch.Tensor)
768
+ or isinstance(image, torch.Tensor)
769
+ ), "some data must be supplied"
770
+
771
+ # convert text, condition, and image to tensors if they aren't already
772
+ text, condition, image = self.create_tensors(text, condition, image)
773
+
774
+ # determine the maximum batch size of the input tensors
775
+ batch_size = max(text.shape[0], condition.shape[0], image.shape[0])
776
+
777
+ # match the batch sizes of text, condition, and image
778
+ text, condition, image = match_batch_size(text, condition, image, batch_size)
779
+
780
+ # determine the device of the tensors
781
+ device = next(
782
+ filter(lambda x: isinstance(x, torch.Tensor), [text, condition, image]),
783
+ None,
784
+ ).device
785
+
786
+ assert text.shape[0] == condition.shape[0] == image.shape[0]
787
+
788
+ # Create a tensor of zeros of size (batch_size, image_seq_len, num_image_tokens + 1) and set it to device
789
+
790
+ # full_text_logits = torch.zeros(batch_size, self.text_seq_len, self.num_text_tokens+3).to(device)
791
+ full_text_logits = torch.zeros(
792
+ batch_size, self.text_seq_len, self.num_text_tokens
793
+ ).to(device)
794
+
795
+ # Use scatter_ to fill the tensor with 1 values at the indices given by the image tensor
796
+ full_text_logits = full_text_logits.scatter_(
797
+ dim=-1, index=text.unsqueeze(-1), value=1
798
+ )
799
+ # Use scatter_ to fill the tensor with 1 values at the indices given by the image tensor
800
+ full_image_logits = torch.zeros(
801
+ batch_size, self.image_seq_len, self.num_image_tokens + 1
802
+ ).to(device)
803
+
804
+ # Remove the last token from each image sequence by setting full_image_logits to its first num_image_tokens elements
805
+ full_image_logits = full_image_logits.scatter_(
806
+ dim=-1, index=image.unsqueeze(-1), value=1
807
+ )
808
+
809
+ # cut off mask token
810
+ full_image_logits = full_image_logits[:, :, : self.num_image_tokens]
811
+
812
+ count = 0
813
+
814
+ for timestep in progress(torch.linspace(0, 1, timesteps)):
815
+ # Create masks for the text, condition, and image tensors
816
+ text_mask = text == self.text_mask_token
817
+ cond_mask = condition == self.cond_mask_token
818
+ image_mask = image == self.image_mask_token
819
+
820
+ # Calculate logits and samples using the calculate_logits function
821
+ logits, sample = calculate_logits(
822
+ [text, condition, image],
823
+ [text_mask, cond_mask, image_mask],
824
+ self.embed_and_transform,
825
+ filter_thres,
826
+ temperature,
827
+ )
828
+
829
+ # Calculate the number of masked tokens in the text and image tensors
830
+ num_masked_text_tokens = torch.sum(text_mask, dim=1)[0]
831
+ num_masked_image_tokens = torch.sum(image_mask, dim=1)[0]
832
+
833
+ # If there are masked text tokens, unmask them using unmask_tokens and fill the full text logits tensor with -inf for unmasked tokens
834
+ if num_masked_text_tokens.any() > 0:
835
+ text, full_text_logits = unmask_tokens(
836
+ text,
837
+ text_mask,
838
+ num_masked_text_tokens,
839
+ logits[:, : self.text_seq_len, : self.num_text_tokens],
840
+ sample[:, : self.text_seq_len, : self.num_text_tokens],
841
+ timestep,
842
+ timesteps,
843
+ self.gamma,
844
+ suppress_invalid_text_tokens,
845
+ self.pad_token,
846
+ self.text_mask_token,
847
+ force_aas=force_aas,
848
+ )
849
+ full_text_logits = full_text_logits.masked_fill(
850
+ ~text_mask.unsqueeze(-1), -torch.inf
851
+ )
852
+
853
+ # If there are masked image tokens, unmask them using unmask_tokens and fill the full image logits tensor with -inf for unmasked tokens
854
+ if num_masked_image_tokens > 0:
855
+ image, full_image_logits = unmask_tokens(
856
+ image,
857
+ image_mask,
858
+ num_masked_image_tokens,
859
+ logits[:, -self.image_seq_len :, -(self.num_image_tokens + 1) : -1],
860
+ sample[:, -self.image_seq_len :, -(self.num_image_tokens + 1) : -1],
861
+ timestep,
862
+ timesteps,
863
+ self.gamma,
864
+ )
865
+ full_text_logits = full_text_logits.masked_fill(
866
+ ~text_mask.unsqueeze(-1), -torch.inf
867
+ )
868
+
869
+ # Generate heatmap
870
+ with torch.no_grad():
871
+ # Normalize full image logits tensor
872
+ full_image_logits /= torch.max(
873
+ torch.abs(full_image_logits), dim=-1, keepdim=True
874
+ ).values
875
+
876
+ # Apply quantize embedding to full image logits tensor
877
+ full_image_logits = torch.matmul(
878
+ full_image_logits, self.vae.model.quantize.embedding.weight
879
+ )
880
+
881
+ # Rearrange full image logits tensor
882
+ h = int(self.image_seq_len**0.5)
883
+ full_image_logits = rearrange(
884
+ full_image_logits, "b (h w) c -> b c h w", h=h
885
+ )
886
+
887
+ # Decode full image logits tensor
888
+ full_image_logits = self.vae.model.decode(full_image_logits)
889
+
890
+ # Add clipping to full image logits tensor
891
+ max_val = torch.max(full_image_logits.view(batch_size, -1), dim=-1)[0]
892
+ min_val = torch.min(full_image_logits.view(batch_size, -1), dim=-1)[0]
893
+ full_image_logits += torch.clip(1 - max_val, 0, float("inf")).view(
894
+ batch_size, 1, 1, 1
895
+ )
896
+ full_image_logits += torch.clip(0 - min_val, float("-inf"), 0).view(
897
+ batch_size, 1, 1, 1
898
+ )
899
+
900
+ # Clip full image logits tensor values to the range [0, 1]
901
+ full_image_logits = torch.clip(full_image_logits, 0, 1)
902
+
903
+ # Return text tensor, detokenized text tensor, full text logits tensor,
904
+ # binary image tensor, and full image logits tensor
905
+ return (
906
+ text,
907
+ detokenize_text(self.text_embedding, text),
908
+ full_text_logits,
909
+ 1.0 * (vae.decode(image) > 0.5),
910
+ full_image_logits,
911
+ )
912
+
913
+ @torch.no_grad()
914
+ @eval_decorator
915
+ def sample_text(
916
+ self,
917
+ text=False,
918
+ condition=False,
919
+ image=False,
920
+ temperature=1.0,
921
+ filter_thres=0.9,
922
+ progress=False,
923
+ n_unmask=1,
924
+ place_amino=True,
925
+ force_aas=False,
926
+ ):
927
+ # set model and VAEs to evaluation mode
928
+ self.eval()
929
+
930
+ # ensure that at least one of text, condition, or image is supplied
931
+ assert (
932
+ isinstance(text, torch.Tensor)
933
+ or isinstance(condition, torch.Tensor)
934
+ or isinstance(image, torch.Tensor)
935
+ ), "some data must be supplied"
936
+
937
+ # convert text, condition, and image to tensors if they aren't already
938
+ text, condition, image = self.create_tensors(text, condition, image)
939
+
940
+ # determine the maximum batch size of the input tensors
941
+ batch_size = max(text.shape[0], condition.shape[0], image.shape[0])
942
+
943
+ # match the batch sizes of text, condition, and image
944
+ text, condition, image = match_batch_size(text, condition, image, batch_size)
945
+
946
+ # determine the device of the tensors
947
+ device = next(
948
+ filter(lambda x: isinstance(x, torch.Tensor), [text, condition, image]),
949
+ None,
950
+ ).device
951
+
952
+ assert text.shape[0] == condition.shape[0] == image.shape[0]
953
+
954
+ # Create a tensor of zeros of size (batch_size, image_seq_len, num_image_tokens + 1) and set it to device
955
+
956
+ # full_text_logits = torch.zeros(batch_size, self.text_seq_len, self.num_text_tokens+3).to(device)
957
+ full_text_logits = torch.zeros(
958
+ batch_size, self.text_seq_len, self.num_text_tokens
959
+ ).to(device)
960
+
961
+ # Use scatter_ to fill the tensor with 1 values at the indices given by the image tensor
962
+ full_text_logits = full_text_logits.scatter_(
963
+ dim=-1, index=text.unsqueeze(-1), value=1
964
+ )
965
+
966
+ text_mask = text == self.text_mask_token
967
+ cond_mask = condition == self.cond_mask_token
968
+ image_mask = image == self.image_mask_token
969
+
970
+ mask_indices = text_mask.nonzero()
971
+ non_mask_indices = (~text_mask).nonzero()
972
+
973
+ # figure out the center of the amino acids to determine generation direction
974
+ central_protein_index = torch.tensor(
975
+ [
976
+ torch.median(
977
+ non_mask_indices[torch.where(non_mask_indices[:, 0] == idx)][:, -1]
978
+ )
979
+ for idx in range(batch_size)
980
+ ]
981
+ )
982
+
983
+ count = 1
984
+
985
+ run_mask = text_mask
986
+ if progress:
987
+ pbar = progress(total=torch.sum(run_mask).item())
988
+ while torch.sum(run_mask) > 0:
989
+ logits, sample = calculate_logits(
990
+ [text, condition, image],
991
+ [text_mask, cond_mask, image_mask],
992
+ self.embed_and_transform,
993
+ filter_thres,
994
+ temperature,
995
+ )
996
+
997
+ # sub_sample: [batch_size ,text_seq_len ,num_text_tokens]
998
+ sub_sample = sample[:, : self.text_seq_len, : self.num_text_tokens]
999
+ sub_sample = sub_sample.masked_fill(~text_mask.unsqueeze(-1), -torch.inf)
1000
+ sub_sample = suppress_invalid_text_tokens(
1001
+ text, sub_sample, 0, 2, self.pad_token, self.text_mask_token, force_aas
1002
+ )
1003
+ # calculate % to unmasked
1004
+ # get most likely token and probability for each position
1005
+
1006
+ for idx in range(batch_size):
1007
+ selected_mask_indices = mask_indices[
1008
+ torch.where(mask_indices[:, 0] == idx)
1009
+ ][:, -1]
1010
+
1011
+ # Generate to the left
1012
+ if selected_mask_indices[-count] < central_protein_index[idx]:
1013
+ unmask_index = selected_mask_indices[-count]
1014
+ left_sample = max(0, (unmask_index + 1) - n_unmask)
1015
+ right_sample = min(unmask_index + 1, self.text_seq_len - 1)
1016
+ central_protein_index[idx] = max(
1017
+ 0, central_protein_index[idx] - 0.5 * n_unmask
1018
+ )
1019
+
1020
+ # Generate to the right
1021
+ elif selected_mask_indices[count - 1] > central_protein_index[idx]:
1022
+ unmask_index = selected_mask_indices[count - 1]
1023
+ left_sample = max(0, unmask_index)
1024
+ right_sample = min(unmask_index + n_unmask, self.text_seq_len - 1)
1025
+ central_protein_index[idx] = min(
1026
+ central_protein_index[idx] + 0.5 * n_unmask,
1027
+ self.text_seq_len - 1,
1028
+ )
1029
+
1030
+ # save logits for relevant position
1031
+ full_text_logits[
1032
+ idx, left_sample:right_sample, : self.text_seq_len - 1
1033
+ ] = logits[idx, left_sample:right_sample, : self.num_text_tokens]
1034
+
1035
+ run_mask[idx, left_sample:right_sample] = False
1036
+
1037
+ # you may want to resample the amion acids or calculate marginal probs
1038
+ # if so, set place_amino to false
1039
+ if place_amino:
1040
+ text[idx, left_sample:right_sample] = torch.where(
1041
+ text[idx, left_sample:right_sample] == self.text_mask_token,
1042
+ sub_sample[
1043
+ idx, left_sample:right_sample, : self.num_text_tokens
1044
+ ].argmax(dim=-1),
1045
+ text[idx, left_sample:right_sample],
1046
+ )
1047
+
1048
+ text_mask = run_mask
1049
+
1050
+ count += n_unmask
1051
+
1052
+ if progress:
1053
+ pbar.update(n_unmask)
1054
+ if progress:
1055
+ pbar.close()
1056
+
1057
+ return (
1058
+ text,
1059
+ detokenize_text(self.text_embedding, text),
1060
+ full_text_logits,
1061
+ )
celle/reversible.py ADDED
@@ -0,0 +1,36 @@
1
+ import torch.nn as nn
2
+
3
+ # for routing arguments into the functions of the reversible layer
4
+ def route_args(router, args, depth):
5
+ routed_args = [(dict(), dict()) for _ in range(depth)]
6
+ matched_keys = [key for key in args.keys() if key in router]
7
+
8
+ for key in matched_keys:
9
+ val = args[key]
10
+ for depth, ((f_args, g_args), routes) in enumerate(
11
+ zip(routed_args, router[key])
12
+ ):
13
+ new_f_args, new_g_args = map(
14
+ lambda route: ({key: val} if route else {}), routes
15
+ )
16
+ routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})
17
+ return routed_args
18
+
19
+ class SequentialSequence(nn.Module):
20
+ def __init__(self, layers, args_route={}, layer_dropout=0.0):
21
+ super().__init__()
22
+ assert all(
23
+ len(route) == len(layers) for route in args_route.values()
24
+ ), "each argument route map must have the same depth as the number of sequential layers"
25
+ self.layers = layers
26
+ self.args_route = args_route
27
+ self.layer_dropout = layer_dropout
28
+
29
+ def forward(self, x, **kwargs):
30
+ args = route_args(self.args_route, kwargs, len(self.layers))
31
+ layers_and_args = list(zip(self.layers, args))
32
+
33
+ for (f, g), (f_args, g_args) in layers_and_args:
34
+ x = x + f(x, **f_args)
35
+ x = x + g(x, **g_args)
36
+ return x
celle/transformer.py ADDED
@@ -0,0 +1,213 @@
1
+ from functools import partial
2
+
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ from einops import rearrange
7
+
8
+ from celle.reversible import SequentialSequence
9
+ from celle.attention import Attention
10
+
11
+ from rotary_embedding_torch import RotaryEmbedding, broadcat
12
+ from celle.utils import exists, default, cast_tuple
13
+
14
+ # https://arxiv.org/abs/2103.17239
15
+ class LayerScale(nn.Module):
16
+ def __init__(self, dim, depth, fn):
17
+ super().__init__()
18
+ if depth <= 18:
19
+ init_eps = 0.1
20
+ elif depth > 18 and depth <= 24:
21
+ init_eps = 1e-5
22
+ else:
23
+ init_eps = 1e-6
24
+
25
+ scale = torch.zeros(1, 1, dim).fill_(init_eps)
26
+ self.scale = nn.Parameter(scale)
27
+ self.fn = fn
28
+
29
+ def forward(self, x, **kwargs):
30
+ return self.fn(x, **kwargs) * self.scale
31
+
32
+
33
+ # layer norm
34
+ class PreNorm(nn.Module):
35
+ def __init__(self, dim, fn):
36
+ super().__init__()
37
+ self.norm = nn.LayerNorm(dim)
38
+ self.norm_out = nn.Identity()
39
+ self.fn = fn
40
+
41
+ def forward(self, x, **kwargs):
42
+ x = self.norm(x)
43
+ x = self.fn(x, **kwargs)
44
+ return self.norm_out(x)
45
+
46
+
47
+ # feed forward
48
+
49
+
50
+ class GEGLU(nn.Module):
51
+ def forward(self, x):
52
+ x, gates = x.chunk(2, dim=-1)
53
+ return x * F.gelu(gates)
54
+
55
+
56
+ class FeedForward(nn.Module):
57
+ def __init__(self, dim, dropout=0.0, mult=4.0):
58
+ super().__init__()
59
+ self.net = nn.Sequential(
60
+ nn.Linear(dim, dim * mult * 2),
61
+ GEGLU(),
62
+ nn.Dropout(dropout),
63
+ nn.Linear(dim * mult, dim),
64
+ )
65
+
66
+ def forward(self, x):
67
+ return self.net(x)
68
+
69
+
70
+ # main transformer class
71
+ class Transformer(nn.Module):
72
+ def __init__(
73
+ self,
74
+ *,
75
+ dim,
76
+ depth,
77
+ seq_len,
78
+ causal=True,
79
+ heads=8,
80
+ dim_head=64,
81
+ ff_mult=4,
82
+ attn_dropout=0.0,
83
+ ff_dropout=0.0,
84
+ image_fmap_size=None,
85
+ num_images=None,
86
+ stable=False,
87
+ rotary_emb=True,
88
+ ):
89
+ super().__init__()
90
+ layers = nn.ModuleList([])
91
+
92
+ self.seq_len = seq_len
93
+ self.image_fmap_size = image_fmap_size
94
+
95
+ for ind in range(depth):
96
+
97
+ attn_class = partial(Attention, stable=stable)
98
+
99
+ attn = attn_class(
100
+ dim,
101
+ causal=causal,
102
+ seq_len=seq_len,
103
+ heads=heads,
104
+ dim_head=dim_head,
105
+ dropout=attn_dropout,
106
+ )
107
+
108
+ ff = FeedForward(dim, mult=ff_mult, dropout=ff_dropout)
109
+
110
+ layers.append(
111
+ nn.ModuleList(
112
+ [
113
+ LayerScale(
114
+ dim, ind + 1, PreNorm(dim, attn)
115
+ ),
116
+ LayerScale(
117
+ dim, ind + 1, PreNorm(dim, ff)
118
+ ),
119
+ ]
120
+ )
121
+ )
122
+
123
+ # pairs arguments with attention layer
124
+ route_attn = ((True, False),) * depth
125
+ attn_route_map = {
126
+ "mask": route_attn,
127
+ "rotary_pos_emb": route_attn,
128
+ }
129
+
130
+ self.layers = SequentialSequence(layers, args_route=attn_route_map)
131
+
132
+ # generate positional embeddings for rotary
133
+
134
+ pos_emb = None
135
+ if rotary_emb:
136
+ rot_dim = dim_head // 3
137
+ img_seq_len = ((image_fmap_size // num_images) ** 2) * num_images
138
+
139
+ text_len = seq_len - img_seq_len + 1
140
+
141
+ text_pos_emb = RotaryEmbedding(dim=rot_dim)
142
+
143
+ img_axial_pos_emb = RotaryEmbedding(dim=rot_dim, freqs_for="pixel")
144
+
145
+ text_freqs = text_pos_emb(torch.arange(text_len))
146
+
147
+ img_to_text_freqs = text_pos_emb(
148
+ torch.full((img_seq_len,), 8192)
149
+ ) # image is given a position far away from text
150
+
151
+ text_freqs = torch.cat((text_freqs, img_to_text_freqs), dim=0)
152
+
153
+ img_freqs_axial = img_axial_pos_emb(
154
+ torch.linspace(-1, 1, steps=image_fmap_size)
155
+ )
156
+
157
+ if num_images > 1:
158
+ split_img_freqs_axial = torch.split(
159
+ img_freqs_axial, image_fmap_size // num_images, dim=0
160
+ )
161
+
162
+ split_img_freqs = [
163
+ broadcat(
164
+ (
165
+ rearrange(img_freqs_axial_per_image, "i d -> i () d"),
166
+ rearrange(img_freqs_axial_per_image, "j d -> () j d"),
167
+ ),
168
+ dim=-1,
169
+ )
170
+ for img_freqs_axial_per_image in split_img_freqs_axial
171
+ ]
172
+
173
+ split_img_freqs = [
174
+ rearrange(img_freqs_per_image, "h w d -> (h w) d")
175
+ for img_freqs_per_image in split_img_freqs
176
+ ]
177
+
178
+ # concatenate the per-image frequency grids
179
+
180
+ img_freqs = torch.cat(split_img_freqs, dim=0)
181
+
182
+ elif num_images == 1:
183
+ img_freqs = broadcat(
184
+ (
185
+ rearrange(img_freqs_axial, "i d -> i () d"),
186
+ rearrange(img_freqs_axial, "j d -> () j d"),
187
+ ),
188
+ dim=-1,
189
+ )
190
+
191
+ img_freqs = rearrange(img_freqs, "h w d -> (h w) d")
192
+
193
+ else:
194
+ assert False, "num_images must be int greater than 0"
195
+ self.img_axial_pos_emb = img_axial_pos_emb
196
+ self.text_pos_emb = text_pos_emb
197
+
198
+ text_axial_freqs = img_axial_pos_emb(
199
+ torch.full((text_len,), -10.0)
200
+ ) # text is given a position of -10, well outside the image axial positions, which lie in the range [-1, 1]
201
+
202
+ text_axial_freqs = torch.cat((text_axial_freqs, text_axial_freqs), dim=-1)
203
+
204
+ img_freqs = torch.cat((text_axial_freqs, img_freqs), dim=0)
205
+
206
+ pos_emb = torch.cat((text_freqs, img_freqs), dim=-1)
207
+
208
+ pos_emb = rearrange(pos_emb, "n d -> () n d")
209
+
210
+ self.register_buffer("pos_emb", pos_emb)
211
+
212
+ def forward(self, x, **kwargs):
213
+ return self.layers(x, rotary_pos_emb=self.pos_emb, **kwargs)
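The residual sub-layer built in `Transformer.__init__` is `LayerScale(PreNorm(fn))` around either attention or feed-forward. A small sketch of that composition in isolation, assuming the repository root is importable; the sizes are arbitrary.

```python
import torch
from celle.transformer import FeedForward, LayerScale, PreNorm

dim = 64
# one feed-forward sub-layer, composed exactly as in Transformer.__init__
block = LayerScale(dim, depth=1, fn=PreNorm(dim, FeedForward(dim, mult=4, dropout=0.1)))

x = torch.randn(2, 10, dim)
out = x + block(x)  # residual update, as applied inside SequentialSequence
print(out.shape)    # torch.Size([2, 10, 64])
```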
celle/utils.py ADDED
@@ -0,0 +1,228 @@
1
+ import torch
2
+ from torchvision import transforms
3
+ from math import pi
4
+ import torchvision.transforms.functional as TF
5
+
6
+
7
+ # Define helper functions
8
+ def exists(val):
9
+ """Check if a variable exists"""
10
+ return val is not None
11
+
12
+
13
+ def uniq(arr):
14
+ return {el: True for el in arr}.keys()
15
+
16
+
17
+ def default(val, d):
18
+ """If a value exists, return it; otherwise, return a default value"""
19
+ return val if exists(val) else d
20
+
21
+
22
+ def max_neg_value(t):
23
+ return -torch.finfo(t.dtype).max
24
+
25
+
26
+ def cast_tuple(val, depth=1):
27
+ if isinstance(val, list):
28
+ val = tuple(val)
29
+ return val if isinstance(val, tuple) else (val,) * depth
30
+
31
+
32
+ def is_empty(t):
33
+ """Check if a tensor is empty"""
34
+ # Return True if the number of elements in the tensor is zero, else False
35
+ return t.nelement() == 0
36
+
37
+
38
+ def masked_mean(t, mask, dim=1):
39
+ """
40
+ Compute the mean of a tensor, masked by a given mask
41
+
42
+ Args:
43
+ t (torch.Tensor): input tensor of shape (batch_size, seq_len, hidden_dim)
44
+ mask (torch.Tensor): mask tensor of shape (batch_size, seq_len)
45
+ dim (int): dimension along which to compute the mean (default=1)
46
+
47
+ Returns:
48
+ torch.Tensor: masked mean tensor of shape (batch_size, hidden_dim)
49
+ """
50
+ t = t.masked_fill(~mask[:, :, None], 0.0)
51
+ return t.sum(dim=1) / mask.sum(dim=1)[..., None]
52
+
53
+
54
+ def set_requires_grad(model, value):
55
+ """
56
+ Set whether or not the model's parameters require gradients
57
+
58
+ Args:
59
+ model (torch.nn.Module): the PyTorch model to modify
60
+ value (bool): whether or not to require gradients
61
+ """
62
+ for param in model.parameters():
63
+ param.requires_grad = value
64
+
65
+
66
+ def eval_decorator(fn):
67
+ """
68
+ Decorator function to evaluate a given function
69
+
70
+ Args:
71
+ fn (callable): function to evaluate
72
+
73
+ Returns:
74
+ callable: the decorated function
75
+ """
76
+
77
+ def inner(model, *args, **kwargs):
78
+ was_training = model.training
79
+ model.eval()
80
+ out = fn(model, *args, **kwargs)
81
+ model.train(was_training)
82
+ return out
83
+
84
+ return inner
85
+
86
+
87
+ def log(t, eps=1e-20):
88
+ """
89
+ Compute the natural logarithm of a tensor
90
+
91
+ Args:
92
+ t (torch.Tensor): input tensor
93
+ eps (float): small value to add to prevent taking the log of 0 (default=1e-20)
94
+
95
+ Returns:
96
+ torch.Tensor: the natural logarithm of the input tensor
97
+ """
98
+ return torch.log(t + eps)
99
+
100
+
101
+ def gumbel_noise(t):
102
+ """
103
+ Generate Gumbel noise
104
+
105
+ Args:
106
+ t (torch.Tensor): input tensor
107
+
108
+ Returns:
109
+ torch.Tensor: a tensor of Gumbel noise with the same shape as the input tensor
110
+ """
111
+ noise = torch.zeros_like(t).uniform_(0, 1)
112
+ return -log(-log(noise))
113
+
114
+
115
+ def gumbel_sample(t, temperature=0.9, dim=-1):
116
+ """
117
+ Sample from a Gumbel-softmax distribution
118
+
119
+ Args:
120
+ t (torch.Tensor): input tensor of shape (batch_size, num_classes)
121
+ temperature (float): temperature for the Gumbel-softmax distribution (default=0.9)
122
+ dim (int): dimension along which to sample (default=-1)
123
+
124
+ Returns:
125
+ torch.Tensor: a tensor of samples from the Gumbel-softmax distribution with the same shape as the input tensor
126
+ """
127
+ return (t / max(temperature, 1e-10)) + gumbel_noise(t)
128
+
129
+
130
+ def top_k(logits, thres=0.5):
131
+ """
132
+ Return a tensor where all but the top k values are set to negative infinity
133
+
134
+ Args:
135
+ logits (torch.Tensor): input tensor of shape (batch_size, num_classes)
136
+ thres (float): threshold for the top k values (default=0.5)
137
+
138
+ Returns:
139
+ torch.Tensor: a tensor with the same shape as the input tensor, where all but the top k values are set to negative infinity
140
+ """
141
+ num_logits = logits.shape[-1]
142
+ k = max(int((1 - thres) * num_logits), 1)
143
+ val, ind = torch.topk(logits, k)
144
+ probs = torch.full_like(logits, float("-inf"))
145
+ probs.scatter_(-1, ind, val)
146
+ return probs
147
+
148
+
149
+ def gamma_func(mode="cosine", scale=0.15):
150
+ """Return a function that takes a single input r and returns a value based on the selected mode"""
151
+
152
+ # Define a different function based on the selected mode
153
+ if mode == "linear":
154
+ return lambda r: 1 - r
155
+ elif mode == "cosine":
156
+ return lambda r: torch.cos(r * pi / 2)
157
+ elif mode == "square":
158
+ return lambda r: 1 - r**2
159
+ elif mode == "cubic":
160
+ return lambda r: 1 - r**3
161
+ elif mode == "scaled-cosine":
162
+ return lambda r: scale * (torch.cos(r * pi / 2))
163
+ else:
164
+ # Raise an error if the selected mode is not implemented
165
+ raise NotImplementedError
166
+
167
+
168
+ class always:
169
+ """Helper class to always return a given value"""
170
+
171
+ def __init__(self, val):
172
+ self.val = val
173
+
174
+ def __call__(self, x, *args, **kwargs):
175
+ return self.val
176
+
177
+
178
+ class DivideMax(torch.nn.Module):
179
+ def __init__(self, dim):
180
+ super().__init__()
181
+ self.dim = dim
182
+
183
+ def forward(self, x):
184
+ maxes = x.amax(dim=self.dim, keepdim=True).detach()
185
+ return x / maxes
186
+
187
+ def replace_outliers(image, percentile=0.0001):
188
+
189
+ lower_bound, upper_bound = torch.quantile(image, percentile), torch.quantile(
190
+ image, 1 - percentile
191
+ )
192
+ mask = (image <= upper_bound) & (image >= lower_bound)
193
+
194
+ valid_pixels = image[mask]
195
+
196
+ image[~mask] = torch.clip(image[~mask], min(valid_pixels), max(valid_pixels))
197
+
198
+ return image
199
+
200
+
201
+ def process_image(image, dataset, image_type=None):
202
+ image = TF.to_tensor(image).unsqueeze(0)
203
+
204
+ if dataset == "HPA":
205
+ if image_type == 'nucleus':
206
+ normalize = (0.0655, 0.0650)
207
+
208
+ elif image_type == 'protein':
209
+ normalize = (0.1732, 0.1208)
210
+
211
+ elif dataset == "OpenCell":
212
+
213
+ if image_type == 'nucleus':
214
+ normalize = (0.0272, 0.0244)
215
+
216
+ elif image_type == 'protein':
217
+ normalize = (0.0486, 0.0671)
218
+
219
+ t_forms = []
220
+
221
+ t_forms.append(transforms.RandomCrop(256))
222
+
223
+ # t_forms.append(transforms.Normalize(normalize[0],normalize[1]))
224
+
225
+
226
+ image = transforms.Compose(t_forms)(image)
227
+
228
+ return image
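The sampling helpers above are used together downstream: `top_k` masks all but the highest-scoring logits and `gumbel_sample` adds temperature-scaled Gumbel noise. A short sketch, with the final `argmax` added here for illustration (the trainer below imports similarly named helpers from `celle.celle`):

```python
import torch
from celle.utils import gumbel_sample, top_k

logits = torch.randn(1, 16)          # toy logits over 16 codebook tokens
filtered = top_k(logits, thres=0.5)  # keep the top half, set the rest to -inf
noisy = gumbel_sample(filtered, temperature=0.9)
token = noisy.argmax(dim=-1)         # argmax of Gumbel-perturbed logits ~ a categorical sample
print(token.shape)                   # torch.Size([1])
```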
celle/vae.py ADDED
@@ -0,0 +1,112 @@
1
+ from math import sqrt, log
2
+ from omegaconf import OmegaConf
3
+ import importlib
4
+
5
+ import torch
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+
9
+ from einops import rearrange
10
+
11
+ # helper methods
12
+
13
+
14
+ def load_model(path):
15
+ with open(path, "rb") as f:
16
+ return torch.load(f, map_location=torch.device("cpu"))
17
+
18
+
19
+ def map_pixels(x, eps=0.1):
20
+ return (1 - 2 * eps) * x + eps
21
+
22
+
23
+ def unmap_pixels(x, eps=0.1):
24
+ return torch.clamp((x - eps) / (1 - 2 * eps), 0, 1)
25
+
26
+
27
+ def make_contiguous(module):
28
+ with torch.no_grad():
29
+ for param in module.parameters():
30
+ param.set_(param.contiguous())
31
+
32
+
33
+ # VQGAN from Taming Transformers paper
34
+ # https://arxiv.org/abs/2012.09841
35
+
36
+
37
+ def get_obj_from_str(string, reload=False):
38
+ module, cls = string.rsplit(".", 1)
39
+ if reload:
40
+ module_imp = importlib.import_module(module)
41
+ importlib.reload(module_imp)
42
+ return getattr(importlib.import_module(module, package=None), cls)
43
+
44
+
45
+ def instantiate_from_config(config):
46
+ if not "target" in config:
47
+ raise KeyError("Expected key `target` to instantiate.")
48
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
49
+
50
+
51
+ class VQGanVAE(nn.Module):
52
+ def __init__(self, vqgan_model_path=None, vqgan_config_path=None, channels=1):
53
+ super().__init__()
54
+
55
+ assert vqgan_config_path is not None
56
+
57
+ model_path = vqgan_model_path
58
+ config_path = vqgan_config_path
59
+
60
+ config = OmegaConf.load(config_path)
61
+
62
+ model = instantiate_from_config(config["model"])
63
+
64
+ if vqgan_model_path:
65
+
66
+ state = torch.load(model_path, map_location="cpu")["state_dict"]
67
+ model.load_state_dict(state, strict=True)
68
+
69
+ print(f"Loaded VQGAN from {model_path} and {config_path}")
70
+
71
+ self.model = model
72
+
73
+ # f as used in https://github.com/CompVis/taming-transformers#overview-of-pretrained-models
74
+ f = (
75
+ config.model.params.ddconfig.resolution
76
+ / config.model.params.ddconfig.attn_resolutions[0]
77
+ )
78
+ self.num_layers = int(log(f) / log(2))
79
+ self.image_size = config.model.params.ddconfig.resolution
80
+ self.num_tokens = config.model.params.n_embed
81
+ # self.is_gumbel = isinstance(self.model, GumbelVQ)
82
+ self.is_gumbel = False
83
+ self.channels = config.model.params.ddconfig.in_channels
84
+
85
+ def encode(self, img):
86
+ return self.model.encode(img)
87
+
88
+ def get_codebook_indices(self, img):
89
+ b = img.shape[0]
90
+ # img = (2 * img) - 1
91
+ _, _, [_, _, indices] = self.encode(img)
92
+ if self.is_gumbel:
93
+ return rearrange(indices, "b h w -> b (h w)", b=b)
94
+ return rearrange(indices, "(b n) -> b n", b=b)
95
+
96
+ def decode(self, img_seq):
97
+ b, n = img_seq.shape
98
+ one_hot_indices = F.one_hot(img_seq, num_classes=self.num_tokens).float()
99
+ z = (
100
+ one_hot_indices @ self.model.quantize.embed.weight
101
+ if self.is_gumbel
102
+ else (one_hot_indices @ self.model.quantize.embedding.weight)
103
+ )
104
+
105
+ z = rearrange(z, "b (h w) c -> b c h w", h=int(sqrt(n)))
106
+ img = self.model.decode(z)
107
+
108
+ # img = (img.clamp(-1.0, 1.0) + 1) * 0.5
109
+ return img
110
+
111
+ def forward(self, img, optimizer_idx=1):
112
+ return self.model.training_step(img, optimizer_idx=optimizer_idx)
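A sketch of the round trip this wrapper provides, assuming a taming-transformers style config such as the `nucleus_vqgan.yaml` fetched in `app.py`; with `vqgan_model_path=None` the weights stay randomly initialised, so this only checks shapes, and the config path below is a placeholder.

```python
import torch
from celle.vae import VQGanVAE

vae = VQGanVAE(
    vqgan_model_path=None,                   # no checkpoint: random weights, shape check only
    vqgan_config_path="nucleus_vqgan.yaml",  # placeholder path to a VQGAN config
)

img = torch.randn(1, vae.channels, vae.image_size, vae.image_size)
indices = vae.get_codebook_indices(img)  # (b, h*w) discrete codebook ids
recon = vae.decode(indices)              # codebook ids back to image space
print(indices.shape, recon.shape)
```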
celle_main.py ADDED
@@ -0,0 +1,619 @@
1
+ import os
2
+ import numpy as np
3
+
4
+ import torch
5
+ import torch.random
6
+ from torch.optim import AdamW
7
+ from torch.utils.data import DataLoader
8
+ import pytorch_lightning as pl
9
+ from pytorch_lightning import seed_everything
10
+ from pytorch_lightning.trainer import Trainer
11
+
12
+ from dataloader import CellLoader
13
+ from celle import VQGanVAE, CELLE
14
+ from omegaconf import OmegaConf
15
+ import argparse, os, sys, datetime, glob
16
+
17
+ from celle.celle import gumbel_sample, top_k
18
+
19
+ torch.random.manual_seed(42)
20
+ np.random.seed(42)
21
+
22
+ from celle_taming_main import (
23
+ instantiate_from_config,
24
+ nondefault_trainer_args,
25
+ get_parser,
26
+ )
27
+
28
+
29
+ class CellDataModule(pl.LightningDataModule):
30
+ def __init__(
31
+ self,
32
+ data_csv,
33
+ dataset,
34
+ sequence_mode="standard",
35
+ vocab="bert",
36
+ crop_size=256,
37
+ resize=600,
38
+ batch_size=1,
39
+ threshold="median",
40
+ text_seq_len=1000,
41
+ num_workers=1,
42
+ **kwargs,
43
+ ):
44
+ super().__init__()
45
+
46
+ self.data_csv = data_csv
47
+ self.dataset = dataset
48
+ self.protein_sequence_length = 0
49
+ self.image_folders = []
50
+ self.crop_size = crop_size
51
+ self.resize = resize
52
+ self.batch_size = batch_size
53
+ self.sequence_mode = sequence_mode
54
+ self.threshold = threshold
55
+ self.text_seq_len = int(text_seq_len)
56
+ self.vocab = vocab
57
+ self.num_workers = num_workers if num_workers is not None else batch_size * 2
58
+
59
+ def setup(self, stage=None):
60
+ # called on every GPU
61
+ self.cell_dataset_train = CellLoader(
62
+ data_csv=self.data_csv,
63
+ dataset=self.dataset,
64
+ crop_size=self.crop_size,
65
+ resize=self.resize,
66
+ split_key="train",
67
+ crop_method="random",
68
+ sequence_mode=self.sequence_mode,
69
+ vocab=self.vocab,
70
+ text_seq_len=self.text_seq_len,
71
+ threshold=self.threshold,
72
+ )
73
+
74
+ self.cell_dataset_val = CellLoader(
75
+ data_csv=self.data_csv,
76
+ dataset=self.dataset,
77
+ crop_size=self.crop_size,
78
+ resize=self.resize,
79
+ crop_method="center",
80
+ split_key="val",
81
+ sequence_mode=self.sequence_mode,
82
+ vocab=self.vocab,
83
+ text_seq_len=self.text_seq_len,
84
+ threshold=self.threshold,
85
+ )
86
+
87
+ def prepare_data(self):
88
+
89
+ pass
90
+
91
+ def train_dataloader(self):
92
+ return DataLoader(
93
+ self.cell_dataset_train,
94
+ num_workers=self.num_workers,
95
+ shuffle=True,
96
+ batch_size=self.batch_size,
97
+ )
98
+
99
+ def val_dataloader(self):
100
+ return DataLoader(
101
+ self.cell_dataset_val,
102
+ num_workers=self.num_workers,
103
+ batch_size=self.batch_size,
104
+ )
105
+
106
+ # def test_dataloader(self):
107
+ # transforms = ...
108
+ # return DataLoader(self.test, batch_size=64)
109
+
110
+
111
+ class CELLE_trainer(pl.LightningModule):
112
+ def __init__(
113
+ self,
114
+ vqgan_model_path,
115
+ vqgan_config_path,
116
+ ckpt_path=None,
117
+ image_key="threshold",
118
+ condition_model_path=None,
119
+ condition_config_path=None,
120
+ num_images=2,
121
+ dim=2,
122
+ num_text_tokens=30,
123
+ text_seq_len=1000,
124
+ depth=16,
125
+ heads=16,
126
+ dim_head=64,
127
+ attn_dropout=0.1,
128
+ ff_dropout=0.1,
129
+ attn_types="full",
130
+ loss_img_weight=7,
131
+ stable=False,
132
+ rotary_emb=True,
133
+ text_embedding="bert",
134
+ fixed_embedding=True,
135
+ loss_cond_weight=1,
136
+ learning_rate=3e-4,
137
+ monitor="val_loss",
138
+ ):
139
+ super().__init__()
140
+
141
+ vae = VQGanVAE(
142
+ vqgan_model_path=vqgan_model_path, vqgan_config_path=vqgan_config_path
143
+ )
144
+
145
+ self.image_key = image_key
146
+
147
+ if condition_config_path:
148
+ condition_vae = VQGanVAE(
149
+ vqgan_model_path=condition_model_path,
150
+ vqgan_config_path=condition_config_path,
151
+ )
152
+ else:
153
+ condition_vae = None
154
+
155
+ self.celle = CELLE(
156
+ dim=dim,
157
+ vae=vae, # automatically infer (1) image sequence length and (2) number of image tokens
158
+ condition_vae=condition_vae,
159
+ num_images=num_images,
160
+ num_text_tokens=num_text_tokens, # vocab size for text
161
+ text_seq_len=text_seq_len, # text sequence length
162
+ depth=depth, # should aim to be 64
163
+ heads=heads, # attention heads
164
+ dim_head=dim_head, # attention head dimension
165
+ attn_dropout=attn_dropout, # attention dropout
166
+ ff_dropout=ff_dropout, # feedforward dropout
167
+ loss_img_weight=loss_img_weight,
168
+ stable=stable,
169
+ rotary_emb=rotary_emb,
170
+ text_embedding=text_embedding,
171
+ fixed_embedding=fixed_embedding,
172
+ loss_cond_weight=loss_cond_weight,
173
+ )
174
+
175
+ self.learning_rate = learning_rate
176
+ self.num_text_tokens = num_text_tokens
177
+ self.num_images = num_images
178
+
179
+ if monitor is not None:
180
+ self.monitor = monitor
181
+
182
+ ignore_keys = []
183
+
184
+ if condition_model_path:
185
+ ignore_keys.append("celle.condition_vae")
186
+
187
+ if vqgan_model_path:
188
+ ignore_keys.append("celle.vae")
189
+
190
+ if ckpt_path is not None:
191
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
192
+
193
+ def init_from_ckpt(self, path, ignore_keys=list()):
194
+ sd = torch.load(path, map_location="cpu")["state_dict"]
195
+ ckpt = sd.copy()
196
+ for k in sd.keys():
197
+ for ik in ignore_keys:
198
+ if k.startswith(ik):
199
+ # print("Deleting key {} from state_dict.".format(k))
200
+ del ckpt[k]
201
+ self.load_state_dict(ckpt, strict=True)
202
+ print(f"Restored from {path}")
203
+
204
+ def forward(self, text, condition, target, return_loss=True):
205
+
206
+ return self.celle(
207
+ text=text, condition=condition, image=target, return_loss=return_loss
208
+ )
209
+
210
+ def get_input(self, batch):
211
+ text = batch["sequence"].squeeze(1)
212
+ condition = batch["nucleus"]
213
+ target = batch[self.image_key]
214
+
215
+ return text, condition, target
216
+
217
+ def get_image_from_logits(self, logits, temperature=0.9):
218
+
219
+ filtered_logits = top_k(logits, thres=0.5)
220
+ sample = gumbel_sample(filtered_logits, temperature=temperature, dim=-1)
221
+
222
+ self.celle.vae.eval()
223
+ out = self.celle.vae.decode(
224
+ sample[:, self.celle.text_seq_len + self.celle.condition_seq_len :]
225
+ - (self.celle.num_text_tokens + self.celle.num_condition_tokens)
226
+ )
227
+
228
+ return out
229
+
230
+ def get_loss(self, text, condition, target):
231
+
232
+ loss_dict = {}
233
+
234
+ loss, loss_dict, logits = self(text, condition, target, return_loss=True)
235
+
236
+ return loss, loss_dict
237
+
238
+ def total_loss(
239
+ self,
240
+ loss,
241
+ loss_dict,
242
+ mode="train",
243
+ ):
244
+
245
+ loss_dict = {f"{mode}/{key}": value for key, value in loss_dict.items()}
246
+
247
+ for key, value in loss_dict.items():
248
+ self.log(
249
+ key,
250
+ value,
251
+ prog_bar=True,
252
+ logger=True,
253
+ on_step=True,
254
+ on_epoch=True,
255
+ sync_dist=True,
256
+ )
257
+
258
+ return loss
259
+
260
+ def training_step(self, batch, batch_idx):
261
+
262
+ text, condition, target = self.get_input(batch)
263
+ loss, log_dict = self.get_loss(text, condition, target)
264
+
265
+ loss = self.total_loss(loss, log_dict, mode="train")
266
+
267
+ return loss
268
+
269
+ def validation_step(self, batch, batch_idx):
270
+
271
+ with torch.no_grad():
272
+
273
+ text, condition, target = self.get_input(batch)
274
+ loss, log_dict = self.get_loss(text, condition, target)
275
+
276
+ loss = self.total_loss(loss, log_dict, mode="val")
277
+
278
+ return loss
279
+
280
+ def configure_optimizers(self):
281
+
282
+ optimizer = AdamW(self.parameters(), lr=self.learning_rate, betas=(0.9, 0.95))
283
+
284
+ return optimizer
285
+
286
+ def scale_image(self, image):
287
+
288
+ for tensor in image:
289
+ if torch.min(tensor) < 0:
290
+ tensor += -torch.min(tensor)
291
+ else:
292
+ tensor -= torch.min(tensor)
293
+
294
+ tensor /= torch.max(tensor)
295
+
296
+ return image
297
+
298
+ @torch.no_grad()
299
+ def log_images(self, batch, **kwargs):
300
+
301
+ log = []
302
+
303
+ text, condition, target = self.get_input(batch)
304
+ text = text.squeeze(1).to(self.device)
305
+ condition = condition.to(self.device)
306
+
307
+ out = self.celle.generate_images(text=text, condition=condition)
308
+
309
+ log["condition"] = self.scale_image(condition)
310
+ log["output"] = self.scale_image(out)
311
+ if self.image_key == "threshold":
312
+ log["threshold"] = self.scale_image(target)
313
+ log["target"] = self.scale_image(batch["target"])
314
+ else:
315
+ log["target"] = self.scale_image(target)
316
+
317
+ return log
318
+
319
+
320
+ # from https://github.com/CompVis/taming-transformers/blob/master/celle_main.py
321
+
322
+ if __name__ == "__main__":
323
+ # custom parser to specify config files, train, test and debug mode,
324
+ # postfix, resume.
325
+ # `--key value` arguments are interpreted as arguments to the trainer.
326
+ # `nested.key=value` arguments are interpreted as config parameters.
327
+ # configs are merged from left-to-right followed by command line parameters.
328
+
329
+ # model:
330
+ # learning_rate: float
331
+ # target: path to lightning module
332
+ # params:
333
+ # key: value
334
+ # data:
335
+ # target: celle_main.DataModuleFromConfig
336
+ # params:
337
+ # batch_size: int
338
+ # wrap: bool
339
+ # train:
340
+ # target: path to train dataset
341
+ # params:
342
+ # key: value
343
+ # validation:
344
+ # target: path to validation dataset
345
+ # params:
346
+ # key: value
347
+ # test:
348
+ # target: path to test dataset
349
+ # params:
350
+ # key: value
351
+ # lightning: (optional, has sane defaults and can be specified on cmdline)
352
+ # trainer:
353
+ # additional arguments to trainer
354
+ # logger:
355
+ # logger to instantiate
356
+ # modelcheckpoint:
357
+ # modelcheckpoint to instantiate
358
+ # callbacks:
359
+ # callback1:
360
+ # target: importpath
361
+ # params:
362
+ # key: value
363
+
364
+ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
365
+
366
+ # add cwd for convenience and to make classes in this file available when
367
+ # running as `python celle_main.py`
368
+ # (in particular `celle_main.DataModuleFromConfig`)
369
+ sys.path.append(os.getcwd())
370
+
371
+ parser = get_parser()
372
+ parser = Trainer.add_argparse_args(parser)
373
+
374
+ opt, unknown = parser.parse_known_args()
375
+ if opt.name and opt.resume:
376
+ raise ValueError(
377
+ "-n/--name and -r/--resume cannot be specified both."
378
+ "If you want to resume training in a new log folder, "
379
+ "use -n/--name in combination with --resume_from_checkpoint"
380
+ )
381
+ if opt.resume:
382
+ if not os.path.exists(opt.resume):
383
+ raise ValueError("Cannot find {}".format(opt.resume))
384
+ if os.path.isfile(opt.resume):
385
+ paths = opt.resume.split("/")
386
+ idx = len(paths) - paths[::-1].index("logs") + 1
387
+ logdir = "/".join(paths[:idx])
388
+ ckpt = opt.resume
389
+ else:
390
+ assert os.path.isdir(opt.resume), opt.resume
391
+ logdir = opt.resume.rstrip("/")
392
+ ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
393
+
394
+ opt.resume_from_checkpoint = ckpt
395
+ base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
396
+ opt.base = base_configs + opt.base
397
+ _tmp = logdir.split("/")
398
+ nowname = _tmp[_tmp.index("logs") + 1]
399
+ else:
400
+ if opt.name:
401
+ name = "_" + opt.name
402
+ elif opt.base:
403
+ cfg_fname = os.path.split(opt.base[0])[-1]
404
+ cfg_name = os.path.splitext(cfg_fname)[0]
405
+ name = "_" + cfg_name
406
+ else:
407
+ name = ""
408
+ nowname = now + name + opt.postfix
409
+ logdir = os.path.join("logs", nowname)
410
+
411
+ ckptdir = os.path.join(logdir, "checkpoints")
412
+ cfgdir = os.path.join(logdir, "configs")
413
+ seed_everything(opt.seed)
414
+
415
+ try:
416
+ # init and save configs
417
+ configs = [OmegaConf.load(cfg) for cfg in opt.base]
418
+ cli = OmegaConf.from_dotlist(unknown)
419
+ config = OmegaConf.merge(*configs, cli)
420
+ lightning_config = config.pop("lightning", OmegaConf.create())
421
+ # merge trainer cli with config
422
+ trainer_config = lightning_config.get("trainer", OmegaConf.create())
423
+ # default to ddp
424
+ # trainer_config["distributed_backend"] = "ddp"
425
+ for k in nondefault_trainer_args(opt):
426
+ trainer_config[k] = getattr(opt, k)
427
+ if not "gpus" in trainer_config:
428
+ del trainer_config["distributed_backend"]
429
+ cpu = True
430
+ else:
431
+ gpuinfo = trainer_config["gpus"]
432
+ print(f"Running on GPUs {gpuinfo}")
433
+ cpu = False
434
+ trainer_opt = argparse.Namespace(**trainer_config)
435
+ lightning_config.trainer = trainer_config
436
+
437
+ # model
438
+ # model = instantiate_from_config(config.model)
439
+ model = instantiate_from_config(config.model)
440
+ # trainer and callbacks
441
+ trainer_kwargs = dict()
442
+
443
+ # default logger configs
444
+ # NOTE wandb < 0.10.0 interferes with shutdown
445
+ # wandb >= 0.10.0 seems to fix it but still interferes with pudb
446
+ # debugging (wrongly sized pudb ui)
447
+ # thus prefer testtube for now
448
+ default_logger_cfgs = {
449
+ "wandb": {
450
+ "target": "pytorch_lightning.loggers.WandbLogger",
451
+ "params": {
452
+ "name": nowname,
453
+ "save_dir": logdir,
454
+ "offline": opt.debug,
455
+ "id": nowname,
456
+ },
457
+ },
458
+ "testtube": {
459
+ # "target": "pytorch_lightning.loggers.TestTubeLogger",
460
+ "target": "pytorch_lightning.loggers.TensorBoardLogger",
461
+ "params": {
462
+ "name": "testtube",
463
+ "save_dir": logdir,
464
+ },
465
+ },
466
+ }
467
+ default_logger_cfg = default_logger_cfgs["testtube"]
468
+ # logger_cfg = lightning_config.logger or OmegaConf.create()
469
+ try:
470
+ logger_cfg = lightning_config.logger
471
+ except:
472
+ logger_cfg = OmegaConf.create()
473
+ logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
474
+ trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)
475
+
476
+ # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to
477
+ # specify which metric is used to determine best models
478
+ default_modelckpt_cfg = {
479
+ "checkpoint_callback": {
480
+ "target": "pytorch_lightning.callbacks.ModelCheckpoint",
481
+ "params": {
482
+ "dirpath": ckptdir,
483
+ "filename": "{epoch:06}",
484
+ "verbose": True,
485
+ "save_last": True,
486
+ },
487
+ }
488
+ }
489
+ if hasattr(model, "monitor"):
490
+ print(f"Monitoring {model.monitor} as checkpoint metric.")
491
+ default_modelckpt_cfg["checkpoint_callback"]["params"][
492
+ "monitor"
493
+ ] = model.monitor
494
+ default_modelckpt_cfg["checkpoint_callback"]["params"]["save_top_k"] = 3
495
+ try:
496
+ modelckpt_cfg = lightning_config.modelcheckpoint
497
+ except:
498
+ modelckpt_cfg = OmegaConf.create()
499
+ modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
500
+ # trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg)
501
+
502
+ # add callback which sets up log directory
503
+ default_callbacks_cfg = {
504
+ "setup_callback": {
505
+ "target": "celle_taming_main.SetupCallback",
506
+ "params": {
507
+ "resume": opt.resume,
508
+ "now": now,
509
+ "logdir": logdir,
510
+ "ckptdir": ckptdir,
511
+ "cfgdir": cfgdir,
512
+ "config": config,
513
+ "lightning_config": lightning_config,
514
+ },
515
+ },
516
+ # "image_logger": {
517
+ # "target": "celle_taming_main.ImageLogger",
518
+ # "params": {
519
+ # "batch_frequency": 0,
520
+ # "max_images": 0,
521
+ # "clamp": False,
522
+ # "increase_log_steps": False,
523
+ # },
524
+ # },
525
+ # "learning_rate_logger": {
526
+ # "target": "celle_taming_main.LearningRateMonitor",
527
+ # "params": {
528
+ # "logging_interval": "step",
529
+ # # "log_momentum": True
530
+ # },
531
+ # },
532
+ }
533
+ try:
534
+ callbacks_cfg = lightning_config.callbacks
535
+ except:
536
+ callbacks_cfg = OmegaConf.create()
537
+ callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
538
+ callbacks_cfg = OmegaConf.merge(modelckpt_cfg, callbacks_cfg)
539
+ trainer_kwargs["callbacks"] = [
540
+ instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg
541
+ ]
542
+
543
+ trainer = Trainer.from_argparse_args(
544
+ trainer_opt, **trainer_kwargs, profiler="simple"
545
+ )
546
+
547
+ # data
548
+ data = instantiate_from_config(config.data)
549
+ # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
550
+ # calling these ourselves should not be necessary but it is.
551
+ # lightning still takes care of proper multiprocessing though
552
+ data.setup()
553
+ data.prepare_data()
554
+
555
+ # configure learning rate
556
+ bs, lr = config.data.params.batch_size, config.model.learning_rate
557
+
558
+ if not cpu:
559
+ ngpu = len(lightning_config.trainer.gpus.strip(",").split(","))
560
+ else:
561
+ ngpu = 1
562
+ try:
563
+ accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches
564
+ except:
565
+ accumulate_grad_batches = 1
566
+ print(f"accumulate_grad_batches = {accumulate_grad_batches}")
567
+ lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
568
+ model.learning_rate = accumulate_grad_batches * ngpu * bs * lr
569
+
570
+ print(
571
+ "Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (lr)".format(
572
+ model.learning_rate, accumulate_grad_batches, ngpu, bs, lr
573
+ )
574
+ )
575
+
576
+ # allow checkpointing via USR1
577
+ def melk(*args, **kwargs):
578
+ # run all checkpoint hooks
579
+ if trainer.global_rank == 0:
580
+ print("Summoning checkpoint.")
581
+ ckpt_path = os.path.join(ckptdir, "last.ckpt")
582
+ trainer.save_checkpoint(ckpt_path)
583
+
584
+ def divein(*args, **kwargs):
585
+ if trainer.global_rank == 0:
586
+ import pudb
587
+
588
+ pudb.set_trace()
589
+
590
+ import signal
591
+
592
+ signal.signal(signal.SIGUSR1, melk)
593
+ signal.signal(signal.SIGUSR2, divein)
594
+
595
+ # run
596
+ if opt.train:
597
+ try:
598
+ # model = torch.compile(model, mode="reduce_overhead")
599
+ trainer.fit(model, data)  # torch.compile() applied to the (None) return of fit() is a no-op; compile the model before fitting if desired
600
+ except Exception:
601
+ melk()
602
+ raise
603
+ if not opt.no_test and not trainer.interrupted:
604
+ trainer.test(model, data)
605
+ except Exception:
606
+ if opt.debug and trainer.global_rank == 0:
607
+ try:
608
+ import pudb as debugger
609
+ except ImportError:
610
+ import pdb as debugger
611
+ debugger.post_mortem()
612
+ raise
613
+ finally:
614
+ # move newly created debug project to debug_runs
615
+ if opt.debug and not opt.resume and trainer.global_rank == 0:
616
+ dst, name = os.path.split(logdir)
617
+ dst = os.path.join(dst, "debug_runs", name)
618
+ os.makedirs(os.path.split(dst)[0], exist_ok=True)
619
+ os.rename(logdir, dst)
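Besides the CLI path in `__main__`, the two classes above can be wired together by hand. A rough sketch under stated assumptions: the CSV path, config paths, and model sizes below are placeholders, and a real run would take these values from the project configs.

```python
import pytorch_lightning as pl
from celle_main import CellDataModule, CELLE_trainer

data = CellDataModule(
    data_csv="data/HPA.csv",  # placeholder CSV with split, sequence and image path columns
    dataset="HPA",
    batch_size=2,
    text_seq_len=1000,
)

model = CELLE_trainer(
    vqgan_model_path=None,
    vqgan_config_path="threshold_vqgan.yaml",    # placeholder VQGAN configs
    condition_config_path="nucleus_vqgan.yaml",
    dim=64,
    depth=2,
    heads=4,  # toy sizes; the released checkpoints use much larger values
)

trainer = pl.Trainer(max_epochs=1, accelerator="auto", devices=1)
trainer.fit(model, data)
```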
celle_taming_main.py ADDED
@@ -0,0 +1,695 @@
1
+ import argparse, os, sys, datetime, glob, importlib
2
+ from omegaconf import OmegaConf
3
+ import numpy as np
4
+ from PIL import Image
5
+ import torch
6
+ import torchvision
7
+ from torch.utils.data import DataLoader, Dataset
8
+ from dataloader import CellLoader
9
+ from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor
10
+ import pytorch_lightning as pl
11
+ from pytorch_lightning import seed_everything
12
+ from pytorch_lightning.trainer import Trainer
13
+ from pytorch_lightning.callbacks import Callback
14
+ from pytorch_lightning.utilities import rank_zero_only
15
+
16
+
17
+ def get_obj_from_str(string, reload=False):
18
+ module, cls = string.rsplit(".", 1)
19
+ if reload:
20
+ module_imp = importlib.import_module(module)
21
+ importlib.reload(module_imp)
22
+ return getattr(importlib.import_module(module, package=None), cls)
23
+
24
+
25
+ def get_parser(**parser_kwargs):
26
+ def str2bool(v):
27
+ if isinstance(v, bool):
28
+ return v
29
+ if v.lower() in ("yes", "true", "t", "y", "1"):
30
+ return True
31
+ elif v.lower() in ("no", "false", "f", "n", "0"):
32
+ return False
33
+ else:
34
+ raise argparse.ArgumentTypeError("Boolean value expected.")
35
+
36
+ parser = argparse.ArgumentParser(**parser_kwargs)
37
+ parser.add_argument(
38
+ "-n",
39
+ "--name",
40
+ type=str,
41
+ const=True,
42
+ default="",
43
+ nargs="?",
44
+ help="postfix for logdir",
45
+ )
46
+ parser.add_argument(
47
+ "-r",
48
+ "--resume",
49
+ type=str,
50
+ const=True,
51
+ default="",
52
+ nargs="?",
53
+ help="resume from logdir or checkpoint in logdir",
54
+ )
55
+ parser.add_argument(
56
+ "-b",
57
+ "--base",
58
+ nargs="*",
59
+ metavar="base_config.yaml",
60
+ help="paths to base configs. Loaded from left-to-right. "
61
+ "Parameters can be overwritten or added with command-line options of the form `--key value`.",
62
+ default=list(),
63
+ )
64
+ parser.add_argument(
65
+ "-t",
66
+ "--train",
67
+ type=str2bool,
68
+ const=True,
69
+ default=False,
70
+ nargs="?",
71
+ help="train",
72
+ )
73
+ parser.add_argument(
74
+ "--no-test",
75
+ type=str2bool,
76
+ const=True,
77
+ default=False,
78
+ nargs="?",
79
+ help="disable test",
80
+ )
81
+ parser.add_argument(
82
+ "-p", "--project", help="name of new or path to existing project"
83
+ )
84
+ parser.add_argument(
85
+ "-d",
86
+ "--debug",
87
+ type=str2bool,
88
+ nargs="?",
89
+ const=True,
90
+ default=False,
91
+ help="enable post-mortem debugging",
92
+ )
93
+ parser.add_argument(
94
+ "-s",
95
+ "--seed",
96
+ type=int,
97
+ default=42,
98
+ help="seed for seed_everything",
99
+ )
100
+ parser.add_argument(
101
+ "-f",
102
+ "--postfix",
103
+ type=str,
104
+ default="",
105
+ help="post-postfix for default name",
106
+ )
107
+
108
+ return parser
109
+
110
+
111
+ def nondefault_trainer_args(opt):
112
+ parser = argparse.ArgumentParser()
113
+ parser = Trainer.add_argparse_args(parser)
114
+ args = parser.parse_args([])
115
+ return sorted(k for k in vars(args) if getattr(opt, k) != getattr(args, k))
116
+
117
+
118
+ def instantiate_from_config(config):
119
+ if not "target" in config:
120
+ raise KeyError("Expected key `target` to instantiate.")
121
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
122
+
123
+
124
+ class WrappedDataset(Dataset):
125
+ """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset"""
126
+
127
+ def __init__(self, dataset):
128
+ self.data = dataset
129
+
130
+ def __len__(self):
131
+ return len(self.data)
132
+
133
+ def __getitem__(self, idx):
134
+ return self.data[idx]
135
+
136
+
137
+ class DataModuleFromConfig(pl.LightningDataModule):
138
+ def __init__(
139
+ self,
140
+ data_csv,
141
+ dataset,
142
+ crop_size=256,
143
+ resize=600,
144
+ batch_size=1,
145
+ sequence_mode="latent",
146
+ vocab="bert",
147
+ text_seq_len=0,
148
+ num_workers=1,
149
+ threshold=False,
150
+ train=True,
151
+ validation=True,
152
+ test=None,
153
+ wrap=False,
154
+ **kwargs,
155
+ ):
156
+ super().__init__()
157
+ self.data_csv = data_csv
158
+ self.dataset = dataset
159
+ self.image_folders = []
160
+ self.crop_size = crop_size
161
+ self.resize = resize
162
+ self.batch_size = batch_size
163
+ self.sequence_mode = sequence_mode
164
+ self.threshold = threshold
165
+ self.text_seq_len = int(text_seq_len)
166
+ self.vocab = vocab
167
+ self.dataset_configs = dict()
168
+ self.num_workers = num_workers if num_workers is not None else batch_size * 2
169
+ if train is not None:
170
+ self.dataset_configs["train"] = train
171
+ self.train_dataloader = self._train_dataloader
172
+ if validation is not None:
173
+ self.dataset_configs["validation"] = validation
174
+ self.val_dataloader = self._val_dataloader
175
+ if test is not None:
176
+ self.dataset_configs["test"] = test
177
+ self.test_dataloader = self._test_dataloader
178
+ self.wrap = wrap
179
+
180
+ def prepare_data(self):
181
+ pass
182
+
183
+ def setup(self, stage=None):
184
+ # called on every GPU
185
+ self.cell_dataset_train = CellLoader(
186
+ data_csv=self.data_csv,
187
+ dataset=self.dataset,
188
+ crop_size=self.crop_size,
189
+ split_key="train",
190
+ crop_method="random",
191
+ sequence_mode=None,
192
+ vocab=self.vocab,
193
+ text_seq_len=self.text_seq_len,
194
+ threshold=self.threshold,
195
+ )
196
+
197
+ self.cell_dataset_val = CellLoader(
198
+ data_csv=self.data_csv,
199
+ dataset=self.dataset,
200
+ crop_size=self.crop_size,
201
+ split_key="val",
202
+ crop_method="center",
203
+ sequence_mode=None,
204
+ vocab=self.vocab,
205
+ text_seq_len=self.text_seq_len,
206
+ threshold=self.threshold,
207
+ )
208
+
209
+ def _train_dataloader(self):
210
+ return DataLoader(
211
+ self.cell_dataset_train,
212
+ num_workers=self.num_workers,
213
+ pin_memory=True,
214
+ shuffle=True,
215
+ batch_size=self.batch_size,
216
+ )
217
+
218
+ def _val_dataloader(self):
219
+ return DataLoader(
220
+ self.cell_dataset_val,
221
+ num_workers=self.num_workers,
222
+ pin_memory=True,
223
+ batch_size=self.batch_size,
224
+ )
225
+
226
+ # def _test_dataloader(self):
227
+ # return DataLoader(self.datasets["test"], batch_size=self.batch_size,
228
+ # num_workers=self.num_workers)
229
+
230
+
231
+ class SetupCallback(Callback):
232
+ def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config):
233
+ super().__init__()
234
+ self.resume = resume
235
+ self.now = now
236
+ self.logdir = logdir
237
+ self.ckptdir = ckptdir
238
+ self.cfgdir = cfgdir
239
+ self.config = config
240
+ self.lightning_config = lightning_config
241
+
242
+ def on_fit_start(self, trainer, pl_module):
243
+ if trainer.global_rank == 0:
244
+ # Create logdirs and save configs
245
+ os.makedirs(self.logdir, exist_ok=True)
246
+ os.makedirs(self.ckptdir, exist_ok=True)
247
+ os.makedirs(self.cfgdir, exist_ok=True)
248
+
249
+ print("Project config")
250
+ print(OmegaConf.to_yaml(self.config))
251
+ OmegaConf.save(
252
+ self.config,
253
+ os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)),
254
+ )
255
+
256
+ print("Lightning config")
257
+ print(OmegaConf.to_yaml(self.lightning_config))
258
+ OmegaConf.save(
259
+ OmegaConf.create({"lightning": self.lightning_config}),
260
+ os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)),
261
+ )
262
+
263
+ else:
264
+ # ModelCheckpoint callback created log directory --- remove it
265
+ if not self.resume and os.path.exists(self.logdir):
266
+ dst, name = os.path.split(self.logdir)
267
+ dst = os.path.join(dst, "child_runs", name)
268
+ os.makedirs(os.path.split(dst)[0], exist_ok=True)
269
+ try:
270
+ os.rename(self.logdir, dst)
271
+ except FileNotFoundError:
272
+ pass
273
+
274
+
275
+ class ImageLogger(Callback):
276
+ def __init__(
277
+ self, batch_frequency, max_images, clamp=True, increase_log_steps=True
278
+ ):
279
+ super().__init__()
280
+ self.batch_freq = batch_frequency
281
+ self.max_images = max_images
282
+ self.logger_log_images = {
283
+ pl.loggers.WandbLogger: self._wandb,
284
+ # pl.loggers.TestTubeLogger: self._testtube,
285
+ pl.loggers.TensorBoardLogger: self._testtube,
286
+ }
287
+ self.log_steps = [2**n for n in range(int(np.log2(self.batch_freq)) + 1)]
288
+ if not increase_log_steps:
289
+ self.log_steps = [self.batch_freq]
290
+ self.clamp = clamp
291
+
292
+ @rank_zero_only
293
+ def _wandb(self, pl_module, images, batch_idx, split):
294
+ raise ValueError("No way wandb")
295
+ grids = dict()
296
+ for k in images:
297
+ grid = torchvision.utils.make_grid(images[k])
298
+ grids[f"{split}/{k}"] = wandb.Image(grid)
299
+ pl_module.logger.experiment.log(grids)
300
+
301
+ @rank_zero_only
302
+ def _testtube(self, pl_module, images, batch_idx, split):
303
+ for k in images:
304
+ images[k] -= torch.min(images[k])
305
+ images[k] /= torch.max(images[k])
306
+ grid = torchvision.utils.make_grid(images[k])
307
+ # grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w
308
+
309
+ tag = f"{split}/{k}"
310
+ pl_module.logger.experiment.add_image(
311
+ tag, grid, global_step=pl_module.global_step
312
+ )
313
+
314
+ @rank_zero_only
315
+ def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx):
316
+ root = os.path.join(save_dir, "images", split)
317
+ for k in images:
318
+ images[k] -= torch.min(images[k])
319
+ images[k] /= torch.max(images[k])
320
+ grid = torchvision.utils.make_grid(images[k], nrow=4)
321
+
322
+ # grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w
323
+ grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
324
+ grid = grid.numpy()
325
+ grid = (grid * 255).astype(np.uint8)
326
+ filename = "{}_gs-{:06}_e-{:06}_b-{:06}.png".format(
327
+ k, global_step, current_epoch, batch_idx
328
+ )
329
+ path = os.path.join(root, filename)
330
+ os.makedirs(os.path.split(path)[0], exist_ok=True)
331
+ Image.fromarray(grid).save(path)
332
+
333
+ def log_img(self, pl_module, batch, batch_idx, split="train"):
334
+ if (
335
+ self.check_frequency(batch_idx)
336
+ and hasattr(pl_module, "log_images") # batch_idx % self.batch_freq == 0
337
+ and callable(pl_module.log_images)
338
+ and self.max_images > 0
339
+ ):
340
+ logger = type(pl_module.logger)
341
+
342
+ is_train = pl_module.training
343
+ if is_train:
344
+ pl_module.eval()
345
+
346
+ with torch.no_grad():
347
+ images = pl_module.log_images(batch, split=split)
348
+
349
+ for k in images:
350
+ N = min(images[k].shape[0], self.max_images)
351
+ images[k] = images[k][:N]
352
+ if isinstance(images[k], torch.Tensor):
353
+ images[k] = images[k].detach().cpu()
354
+ if self.clamp:
355
+ images[k] = torch.clamp(images[k], -1.0, 1.0)
356
+
357
+ self.log_local(
358
+ pl_module.logger.save_dir,
359
+ split,
360
+ images,
361
+ pl_module.global_step,
362
+ pl_module.current_epoch,
363
+ batch_idx,
364
+ )
365
+
366
+ logger_log_images = self.logger_log_images.get(
367
+ logger, lambda *args, **kwargs: None
368
+ )
369
+ logger_log_images(pl_module, images, pl_module.global_step, split)
370
+
371
+ if is_train:
372
+ pl_module.train()
373
+
374
+ def check_frequency(self, batch_idx):
375
+ if (batch_idx % self.batch_freq) == 0 or (batch_idx in self.log_steps):
376
+ try:
377
+ self.log_steps.pop(0)
378
+ except IndexError:
379
+ pass
380
+ return True
381
+ return False
382
+
383
+ # def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
384
+ # def on_train_batch_end(self, *args, **kwargs):
385
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
386
+ self.log_img(pl_module, batch, batch_idx, split="train")
387
+
388
+ def on_validation_batch_end(
389
+ self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx
390
+ ):
391
+ self.log_img(pl_module, batch, batch_idx, split="val")
392
+
393
+
394
+ if __name__ == "__main__":
395
+ # custom parser to specify config files, train, test and debug mode,
396
+ # postfix, resume.
397
+ # `--key value` arguments are interpreted as arguments to the trainer.
398
+ # `nested.key=value` arguments are interpreted as config parameters.
399
+ # configs are merged from left-to-right followed by command line parameters.
400
+
401
+ # model:
402
+ # base_learning_rate: float
403
+ # target: path to lightning module
404
+ # params:
405
+ # key: value
406
+ # data:
407
+ # target: main.DataModuleFromConfig
408
+ # params:
409
+ # batch_size: int
410
+ # wrap: bool
411
+ # train:
412
+ # target: path to train dataset
413
+ # params:
414
+ # key: value
415
+ # validation:
416
+ # target: path to validation dataset
417
+ # params:
418
+ # key: value
419
+ # test:
420
+ # target: path to test dataset
421
+ # params:
422
+ # key: value
423
+ # lightning: (optional, has sane defaults and can be specified on cmdline)
424
+ # trainer:
425
+ # additional arguments to trainer
426
+ # logger:
427
+ # logger to instantiate
428
+ # modelcheckpoint:
429
+ # modelcheckpoint to instantiate
430
+ # callbacks:
431
+ # callback1:
432
+ # target: importpath
433
+ # params:
434
+ # key: value
435
+
436
+ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
437
+
438
+ # add cwd for convenience and to make classes in this file available when
439
+ # running as `python main.py`
440
+ # (in particular `main.DataModuleFromConfig`)
441
+ sys.path.append(os.getcwd())
442
+
443
+ parser = get_parser()
444
+ parser = Trainer.add_argparse_args(parser)
445
+
446
+ opt, unknown = parser.parse_known_args()
447
+ if opt.name and opt.resume:
448
+ raise ValueError(
449
+ "-n/--name and -r/--resume cannot be specified both."
450
+ "If you want to resume training in a new log folder, "
451
+ "use -n/--name in combination with --resume_from_checkpoint"
452
+ )
453
+ if opt.resume:
454
+ if not os.path.exists(opt.resume):
455
+ raise ValueError("Cannot find {}".format(opt.resume))
456
+ if os.path.isfile(opt.resume):
457
+ paths = opt.resume.split("/")
458
+ idx = len(paths) - paths[::-1].index("logs") + 1
459
+ logdir = "/".join(paths[:idx])
460
+ ckpt = opt.resume
461
+ else:
462
+ assert os.path.isdir(opt.resume), opt.resume
463
+ logdir = opt.resume.rstrip("/")
464
+ ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
465
+
466
+ opt.resume_from_checkpoint = ckpt
467
+ base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml")))
468
+ opt.base = base_configs + opt.base
469
+ _tmp = logdir.split("/")
470
+ nowname = _tmp[_tmp.index("logs") + 1]
471
+ else:
472
+ if opt.name:
473
+ name = "_" + opt.name
474
+ elif opt.base:
475
+ cfg_fname = os.path.split(opt.base[0])[-1]
476
+ cfg_name = os.path.splitext(cfg_fname)[0]
477
+ name = "_" + cfg_name
478
+ else:
479
+ name = ""
480
+ nowname = now + name + opt.postfix
481
+ logdir = os.path.join("logs", nowname)
482
+
483
+ ckptdir = os.path.join(logdir, "checkpoints")
484
+ cfgdir = os.path.join(logdir, "configs")
485
+ seed_everything(opt.seed)
486
+
487
+ try:
488
+ # init and save configs
489
+ configs = [OmegaConf.load(cfg) for cfg in opt.base]
490
+ cli = OmegaConf.from_dotlist(unknown)
491
+ config = OmegaConf.merge(*configs, cli)
492
+ lightning_config = config.pop("lightning", OmegaConf.create())
493
+ # merge trainer cli with config
494
+ trainer_config = lightning_config.get("trainer", OmegaConf.create())
495
+ # default to ddp
496
+ trainer_config["distributed_backend"] = "ddp"
497
+ trainer_config["replace_sampler_ddp"] = False
498
+ trainer_config["strategy"] = "ddp"
499
+ trainer_config["persistent_workers"] = True
500
+ for k in nondefault_trainer_args(opt):
501
+ trainer_config[k] = getattr(opt, k)
502
+ if not "gpus" in trainer_config:
503
+ del trainer_config["distributed_backend"]
504
+ cpu = True
505
+ else:
506
+ gpuinfo = trainer_config["gpus"]
507
+ print(f"Running on GPUs {gpuinfo}")
508
+ cpu = False
509
+ trainer_opt = argparse.Namespace(**trainer_config)
510
+ lightning_config.trainer = trainer_config
511
+
512
+ # model
513
+ model = instantiate_from_config(config.model)
514
+ # trainer and callbacks
515
+ trainer_kwargs = dict()
516
+
517
+ # default logger configs
518
+ # NOTE wandb < 0.10.0 interferes with shutdown
519
+ # wandb >= 0.10.0 seems to fix it but still interferes with pudb
520
+ # debugging (wrongly sized pudb ui)
521
+ # thus prefer testtube for now
522
+ default_logger_cfgs = {
523
+ "wandb": {
524
+ "target": "pytorch_lightning.loggers.WandbLogger",
525
+ "params": {
526
+ "name": nowname,
527
+ "save_dir": logdir,
528
+ "offline": opt.debug,
529
+ "id": nowname,
530
+ },
531
+ },
532
+ "testtube": {
533
+ # "target": "pytorch_lightning.loggers.TestTubeLogger",
534
+ "target": "pytorch_lightning.loggers.TensorBoardLogger",
535
+ "params": {
536
+ "name": "testtube",
537
+ "save_dir": logdir,
538
+ },
539
+ },
540
+ }
541
+ default_logger_cfg = default_logger_cfgs["testtube"]
542
+ try:
543
+ logger_cfg = lightning_config.logger
544
+ except:
545
+ logger_cfg = OmegaConf.create()
546
+ logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
547
+ trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)
548
+
549
+ # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to
550
+ # specify which metric is used to determine best models
551
+ default_modelckpt_cfg = {
552
+ "checkpoint_callback": {
553
+ "target": "pytorch_lightning.callbacks.ModelCheckpoint",
554
+ "params": {
555
+ "dirpath": ckptdir,
556
+ "filename": "{epoch:06}",
557
+ "verbose": True,
558
+ "save_last": True,
559
+ },
560
+ }
561
+ }
562
+ if hasattr(model, "monitor"):
563
+ print(f"Monitoring {model.monitor} as checkpoint metric.")
564
+ default_modelckpt_cfg["checkpoint_callback"]["params"][
565
+ "monitor"
566
+ ] = model.monitor
567
+ default_modelckpt_cfg["checkpoint_callback"]["params"]["save_top_k"] = 3
568
+ try:
569
+ modelckpt_cfg = lightning_config.modelcheckpoint
570
+ except:
571
+ modelckpt_cfg = OmegaConf.create()
572
+
573
+ modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
574
+ # trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg)
575
+
576
+ # loaded_model_callbacks = instantiate_from_config(modelckpt_cfg)
577
+
578
+ # add callback which sets up log directory
579
+ default_callbacks_cfg = {
580
+ "setup_callback": {
581
+ "target": "celle_taming_main.SetupCallback",
582
+ "params": {
583
+ "resume": opt.resume,
584
+ "now": now,
585
+ "logdir": logdir,
586
+ "ckptdir": ckptdir,
587
+ "cfgdir": cfgdir,
588
+ "config": config,
589
+ "lightning_config": lightning_config,
590
+ },
591
+ },
592
+ "image_logger": {
593
+ "target": "celle_taming_main.ImageLogger",
594
+ "params": {
595
+ "batch_frequency": 2000,
596
+ "max_images": 10,
597
+ "clamp": True,
598
+ "increase_log_steps": False,
599
+ },
600
+ },
601
+ "learning_rate_logger": {
602
+ "target": "celle_taming_main.LearningRateMonitor",
603
+ "params": {
604
+ "logging_interval": "step",
605
+ # "log_momentum": True
606
+ },
607
+ },
608
+ }
609
+ try:
610
+ callbacks_cfg = lightning_config.callbacks
611
+ except:
612
+ callbacks_cfg = OmegaConf.create()
613
+ callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
614
+ callbacks_cfg = OmegaConf.merge(modelckpt_cfg, callbacks_cfg)
615
+ trainer_kwargs["callbacks"] = [
616
+ instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg
617
+ ]
618
+ # loaded_callbacks = [
619
+ # instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg
620
+ # ]
621
+
622
+ # trainer_kwargs["callbacks"] = loaded_callbacks.append(loaded_model_callbacks)
623
+
624
+ trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
625
+
626
+ # data
627
+ data = instantiate_from_config(config.data)
628
+ # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
629
+ # calling these ourselves should not be necessary but it is.
630
+ # lightning still takes care of proper multiprocessing though
631
+ data.prepare_data()
632
+ data.setup()
633
+
634
+ # configure learning rate
635
+ bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
636
+ if not cpu:
637
+ ngpu = len(lightning_config.trainer.gpus.strip(",").split(","))
638
+ else:
639
+ ngpu = 1
640
+ try:
641
+ accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches
642
+ except:
643
+ accumulate_grad_batches = 1
644
+ print(f"accumulate_grad_batches = {accumulate_grad_batches}")
645
+ lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches
646
+ model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
647
+ print(
648
+ "Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format(
649
+ model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr
650
+ )
651
+ )
652
+
653
+ # allow checkpointing via USR1
654
+ def melk(*args, **kwargs):
655
+ # run all checkpoint hooks
656
+ if trainer.global_rank == 0:
657
+ print("Summoning checkpoint.")
658
+ ckpt_path = os.path.join(ckptdir, "last.ckpt")
659
+ trainer.save_checkpoint(ckpt_path)
660
+
661
+ def divein(*args, **kwargs):
662
+ if trainer.global_rank == 0:
663
+ import pudb
664
+
665
+ pudb.set_trace()
666
+
667
+ import signal
668
+
669
+ signal.signal(signal.SIGUSR1, melk)
670
+ signal.signal(signal.SIGUSR2, divein)
671
+ # model = torch.compile(model)
672
+ # run
673
+ if opt.train:
674
+ try:
675
+ trainer.fit(model, data)  # torch.compile() applied to the (None) return of fit() is a no-op
676
+ except Exception:
677
+ melk()
678
+ raise
679
+ if not opt.no_test and not trainer.interrupted:
680
+ trainer.test(model, data)
681
+ except Exception:
682
+ if opt.debug and trainer.global_rank == 0:
683
+ try:
684
+ import pudb as debugger
685
+ except ImportError:
686
+ import pdb as debugger
687
+ debugger.post_mortem()
688
+ raise
689
+ finally:
690
+ # move newly created debug project to debug_runs
691
+ if opt.debug and not opt.resume and trainer.global_rank == 0:
692
+ dst, name = os.path.split(logdir)
693
+ dst = os.path.join(dst, "debug_runs", name)
694
+ os.makedirs(os.path.split(dst)[0], exist_ok=True)
695
+ os.rename(logdir, dst)
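For reference, a self-contained illustration of the `target`/`params` convention that `instantiate_from_config` resolves throughout the configs; the class path is arbitrary and only shows the mechanics.

```python
from omegaconf import OmegaConf
from celle_taming_main import instantiate_from_config

cfg = OmegaConf.create(
    {
        "target": "torch.nn.Linear",  # dotted import path to any class
        "params": {"in_features": 8, "out_features": 4},
    }
)

layer = instantiate_from_config(cfg)
print(layer)  # Linear(in_features=8, out_features=4, bias=True)
```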
dataloader.py ADDED
@@ -0,0 +1,308 @@
1
+ import os
2
+ import numpy as np
3
+ from PIL import Image, ImageSequence
4
+ import json
5
+ import pandas as pd
6
+
7
+ import torch
8
+ from torch.utils.data import Dataset
9
+ from torchvision import transforms
10
+ import torchvision.transforms.functional as TF
11
+
12
+ from celle.utils import replace_outliers
13
+
14
+ def simple_conversion(seq):
15
+ """Create 26-dim embedding"""
16
+ chars = [
17
+ "-",
18
+ "M",
19
+ "R",
20
+ "H",
21
+ "K",
22
+ "D",
23
+ "E",
24
+ "S",
25
+ "T",
26
+ "N",
27
+ "Q",
28
+ "C",
29
+ "U",
30
+ "G",
31
+ "P",
32
+ "A",
33
+ "V",
34
+ "I",
35
+ "F",
36
+ "Y",
37
+ "W",
38
+ "L",
39
+ "O",
40
+ "X",
41
+ "Z",
42
+ "B",
43
+ "J",
44
+ ]
45
+
46
+ nums = range(len(chars))
47
+
48
+ seqs_x = np.zeros(len(seq))
49
+
50
+ for idx, char in enumerate(seq):
51
+
52
+ lui = chars.index(char)
53
+
54
+ seqs_x[idx] = nums[lui]
55
+
56
+ return torch.tensor([seqs_x]).long()
57
+
58
+
59
+ class CellLoader(Dataset):
60
+ """imports mined opencell images with protein sequence"""
61
+
62
+ def __init__(
63
+ self,
64
+ data_csv=None,
65
+ dataset=None,
66
+ split_key=None,
67
+ resize=600,
68
+ crop_size=600,
69
+ crop_method="random",
70
+ sequence_mode="simple",
71
+ vocab="bert",
72
+ threshold="median",
73
+ text_seq_len=0,
74
+ pad_mode="random",
75
+ ):
76
+ self.data_csv = data_csv
77
+ self.dataset = dataset
78
+ self.image_folders = []
79
+ self.crop_method = crop_method
80
+ self.resize = resize
81
+ self.crop_size = crop_size
82
+ self.sequence_mode = sequence_mode
83
+ self.threshold = threshold
84
+ self.text_seq_len = int(text_seq_len)
85
+ self.vocab = vocab
86
+ self.pad_mode = pad_mode
87
+
88
+ if self.sequence_mode == "embedding" or self.sequence_mode == "onehot":
89
+
90
+
91
+ if self.vocab == "esm1b" or self.vocab == "esm2":
92
+ from esm import Alphabet
93
+
94
+ self.tokenizer = Alphabet.from_architecture(
95
+ "ESM-1b"
96
+ ).get_batch_converter()
97
+ self.text_seq_len += 2
98
+
99
+ if data_csv:
100
+
101
+ data = pd.read_csv(data_csv)
102
+
103
+ self.parent_path = os.path.dirname(data_csv)
104
+
105
+ if split_key == "train":
106
+ self.data = data[data["split"] == "train"]
107
+ elif split_key == "val":
108
+ self.data = data[data["split"] == "val"]
109
+ else:
110
+ self.data = data
111
+
112
+ self.data = self.data.reset_index(drop=True)
113
+
114
+
115
+
116
+ def __len__(self):
117
+ return len(self.data)
118
+
119
+ def __getitem__(
120
+ self,
121
+ idx,
122
+ get_sequence=True,
123
+ get_images=True,
124
+ ):
125
+ if get_sequence and self.text_seq_len > 0:
126
+
127
+ protein_vector = self.get_protein_vector(idx)
128
+
129
+ else:
130
+ protein_vector = torch.zeros((1, 1))
131
+
132
+ if get_images:
133
+
134
+ nucleus, target, threshold = self.get_images(idx, self.dataset)
135
+ else:
136
+ nucleus, target, threshold = torch.zeros((3, 1))
137
+
138
+ data_dict = {
139
+ "nucleus": nucleus.float(),
140
+ "target": target.float(),
141
+ "threshold": threshold.float(),
142
+ "sequence": protein_vector.long(),
143
+ }
144
+
145
+ return data_dict
146
+
147
+ def get_protein_vector(self, idx):
148
+
149
+ if "protein_sequence" not in self.data.columns:
150
+
151
+ metadata = self.retrieve_metadata(idx)
152
+ protein_sequence = metadata["sequence"]
153
+ else:
154
+ protein_sequence = self.data.iloc[idx]["protein_sequence"]
155
+
156
+ protein_vector = self.tokenize_sequence(protein_sequence)
157
+
158
+ return protein_vector
159
+
160
+ def get_images(self, idx, dataset):
161
+
162
+ if dataset == "HPA":
163
+
164
+ nucleus = Image.open(
165
+ os.path.join(
166
+ self.parent_path, self.data.iloc[idx]["nucleus_image_path"]
167
+ )
168
+ )
169
+
170
+ target = Image.open(
171
+ os.path.join(self.parent_path, self.data.iloc[idx]["target_image_path"])
172
+ )
173
+
174
+ nucleus = TF.to_tensor(nucleus)[0]
175
+ target = TF.to_tensor(target)[0]
176
+
177
+ image = torch.stack([nucleus, target], axis=0)
178
+
179
+ normalize = (0.0655, 0.0650), (0.1732, 0.1208)
180
+
181
+ elif dataset == "OpenCell":
182
+ image = Image.open(
183
+ os.path.join(self.parent_path, self.data.iloc[idx]["image_path"])
184
+ )
185
+ nucleus, target = [page.copy() for page in ImageSequence.Iterator(image)]
186
+
187
+ nucleus = replace_outliers(torch.divide(TF.to_tensor(nucleus), 65536))[0]
188
+ target = replace_outliers(torch.divide(TF.to_tensor(target), 65536))[0]
189
+
190
+ image = torch.stack([nucleus, target], axis=0)
191
+
192
+ normalize = (
193
+ (0.0272, 0.0244),
194
+ (0.0486, 0.0671),
195
+ )
196
+
197
+ # # from https://discuss.pytorch.org/t/how-to-apply-same-transform-on-a-pair-of-picture/14914
198
+
199
+ t_forms = [transforms.Resize(self.resize, antialias=None)]
200
+
201
+ if self.crop_method == "random":
202
+
203
+ t_forms.append(transforms.RandomCrop(self.crop_size))
204
+ t_forms.append(transforms.RandomHorizontalFlip(p=0.5))
205
+ t_forms.append(transforms.RandomVerticalFlip(p=0.5))
206
+
207
+ elif self.crop_method == "center":
208
+
209
+ t_forms.append(transforms.CenterCrop(self.crop_size))
210
+
211
+ t_forms.append(transforms.Normalize(normalize[0], normalize[1]))
212
+
213
+ image = transforms.Compose(t_forms)(image)
214
+
215
+ nucleus, target = image
216
+
217
+ nucleus /= torch.abs(nucleus).max()
218
+ target -= target.min()
219
+ target /= target.max()
220
+
221
+ nucleus = nucleus.unsqueeze(0)
222
+ target = target.unsqueeze(0)
223
+
224
+ threshold = target
225
+
226
+ if self.threshold == "mean":
227
+
228
+ threshold = 1.0 * (threshold > (torch.mean(threshold)))
229
+
230
+ elif self.threshold == "median":
231
+
232
+ threshold = 1.0 * (threshold > (torch.median(threshold)))
233
+
234
+ elif self.threshold == "1090_IQR":
235
+
236
+ p10 = torch.quantile(threshold, 0.1, None)
237
+ p90 = torch.quantile(threshold, 0.9, None)
238
+ threshold = torch.clip(threshold, p10, p90)
239
+
240
+ nucleus = torch.nan_to_num(nucleus, 0.0, 1.0, 0.0)
241
+ target = torch.nan_to_num(target, 0.0, 1.0, 0.0)
242
+ threshold = torch.nan_to_num(threshold, 0.0, 1.0, 0.0)
243
+
244
+ return nucleus, target, threshold
245
+
246
+ def retrieve_metadata(self, idx):
247
+ with open(
248
+ os.path.join(self.parent_path, self.data.iloc[idx]["metadata_path"])
249
+ ) as f:
250
+ metadata = json.load(f)
251
+
252
+ return metadata
253
+
254
+ def tokenize_sequence(self, protein_sequence):
255
+
256
+ pad_token = 0
257
+
258
+ if self.sequence_mode == "simple":
259
+ protein_vector = simple_conversion(protein_sequence)
260
+
261
+ elif self.sequence_mode == "center":
262
+ protein_sequence = protein_sequence.center(self.text_seq_len, "-")
263
+ protein_vector = simple_conversion(protein_sequence)
264
+
265
+ elif self.sequence_mode == "alternating":
266
+ protein_sequence = protein_sequence.center(self.text_seq_len, "-")
267
+ protein_sequence = protein_sequence[::18]
268
+ protein_sequence = protein_sequence.center(
269
+ int(self.text_seq_len / 18) + 1, "-"
270
+ )
271
+ protein_vector = simple_conversion(protein_sequence)
272
+
273
+
274
+ elif self.sequence_mode == "embedding":
275
+
276
+ if self.vocab == "esm1b" or self.vocab == "esm2":
277
+ pad_token = 1
278
+ protein_vector = self.tokenizer([("", protein_sequence)])[-1]
279
+
280
+ if protein_vector.shape[-1] < self.text_seq_len:
281
+
282
+ diff = self.text_seq_len - protein_vector.shape[-1]
283
+
284
+ if self.pad_mode == "end":
285
+ protein_vector = torch.nn.functional.pad(
286
+ protein_vector, (0, diff), "constant", pad_token
287
+ )
288
+ elif self.pad_mode == "random":
289
+ split = diff - np.random.randint(0, diff + 1)
290
+
291
+ protein_vector = torch.cat(
292
+ [torch.ones(1, split) * 0, protein_vector], dim=1
293
+ )
294
+
295
+ protein_vector = torch.nn.functional.pad(
296
+ protein_vector, (0, diff - split), "constant", pad_token
297
+ )
298
+
299
+ elif protein_vector.shape[-1] > self.text_seq_len:
300
+ start_int = np.random.randint(
301
+ 0, protein_vector.shape[-1] - self.text_seq_len
302
+ )
303
+
304
+ protein_vector = protein_vector[
305
+ :, start_int : start_int + self.text_seq_len
306
+ ]
307
+
308
+ return protein_vector.long()
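
A hedged sketch of how the CellLoader above might be instantiated; the CSV path and its columns (split, nucleus_image_path, target_image_path, protein_sequence, metadata_path) are assumptions about the expected data layout, not files included in this commit:

```python
# Hypothetical usage of CellLoader; "data.csv" and its columns are assumed.
from dataloader import CellLoader

loader = CellLoader(
    data_csv="data.csv",          # assumed CSV with a "split" column
    dataset="HPA",
    split_key="train",
    resize=600,
    crop_size=256,
    crop_method="random",
    sequence_mode="embedding",
    vocab="esm2",
    text_seq_len=1000,
    threshold="median",
)

sample = loader[0]
print(sample["nucleus"].shape, sample["target"].shape, sample["sequence"].shape)
```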
images/Armadillo repeat-containing X-linked protein 5 nucleus.jpg ADDED
images/Armadillo repeat-containing X-linked protein 5 protein.jpg ADDED
prediction.py ADDED
@@ -0,0 +1,82 @@
1
+ import os
2
+ os.chdir('..')
3
+ from dataloader import CellLoader
4
+ from celle_main import instantiate_from_config
5
+ from omegaconf import OmegaConf
6
+
7
+ def run_sequence_prediction(
8
+ sequence_input,
9
+ nucleus_image,
10
+ protein_image,
11
+ model_ckpt_path,
12
+ model_config_path,
13
+ device
14
+ ):
15
+ """
16
+ Run Celle model with provided inputs and display results.
17
+
18
+ :param sequence: Path to sequence file
19
+ :param nucleus_image_path: Path to nucleus image
20
+ :param protein_image_path: Path to protein image (optional)
21
+ :param model_ckpt_path: Path to model checkpoint
22
+ :param model_config_path: Path to model config
23
+ """
24
+
25
+ # Instantiate dataset object
26
+ dataset = CellLoader(
27
+ sequence_mode="embedding",
28
+ vocab="esm2",
29
+ split_key="val",
30
+ crop_method="center",
31
+ resize=600,
32
+ crop_size=256,
33
+ text_seq_len=1000,
34
+ pad_mode="end",
35
+ threshold="median",
36
+ )
37
+
38
+ # Check if sequence is provided and valid
39
+ if len(sequence_input) == 0:
40
+ raise ValueError("Sequence must be provided.")
41
+
42
+ if "<mask>" not in sequence_input:
43
+ print("Warning: Sequence does not contain any masked positions to predict.")
44
+
45
+ # Convert SEQUENCE to sequence using dataset.tokenize_sequence()
46
+ sequence = dataset.tokenize_sequence(sequence_input)
47
+
48
+ # Load model config and set ckpt_path if not provided in config
49
+ config = OmegaConf.load(model_config_path)
50
+ if config["model"]["params"]["ckpt_path"] is None:
51
+ config["model"]["params"]["ckpt_path"] = model_ckpt_path
52
+
53
+ # Set condition_model_path and vqgan_model_path to None
54
+ config["model"]["params"]["condition_model_path"] = None
55
+ config["model"]["params"]["vqgan_model_path"] = None
56
+
57
+ # Instantiate model from config and move to device
58
+ model = instantiate_from_config(config).to(device)
59
+
60
+ # Sample from model using provided sequence and nucleus image
61
+ _, predicted_sequence, _ = model.celle.sample_text(
62
+ text=sequence,
63
+ condition=nucleus_image,
64
+ image=protein_image,
65
+ force_aas=True,
66
+ timesteps=1,
67
+ temperature=1,
68
+ progress=True,
69
+ )
70
+
71
+ formatted_predicted_sequence = ""
72
+
73
+ for i in range(min(len(predicted_sequence), len(sequence))):
74
+ if predicted_sequence[i] != sequence[i]:
75
+ formatted_predicted_sequence += f"**{predicted_sequence[i]}**"
76
+ else:
77
+ formatted_predicted_sequence += predicted_sequence[i]
78
+
79
+ if len(predicted_sequence) > len(sequence):
80
+ formatted_predicted_sequence += f"**{predicted_sequence[len(sequence):]}**"
81
+
82
+ return formatted_predicted_sequence
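
For reference, a hedged sketch of calling run_sequence_prediction directly; the tensors, the masked sequence, and the model paths below are placeholders:

```python
# Hypothetical call to run_sequence_prediction; all inputs are placeholders.
import torch
from prediction import run_sequence_prediction

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nucleus_image = torch.zeros(1, 1, 256, 256)   # stand-in for a processed nucleus image
protein_image = torch.zeros(1, 1, 256, 256)   # stand-in for a binarized protein image

formatted = run_sequence_prediction(
    sequence_input="M<mask><mask>KGEELFTGVVPILVELDGDVNGHK",  # truncated example
    nucleus_image=nucleus_image,
    protein_image=protein_image,
    model_ckpt_path="model.ckpt",     # placeholder path
    model_config_path="config.yaml",  # placeholder path
    device=device,
)
print(formatted)  # residues that differ from the input are wrapped in ** **
```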
taming/lr_scheduler.py ADDED
@@ -0,0 +1,34 @@
1
+ import numpy as np
2
+
3
+
4
+ class LambdaWarmUpCosineScheduler:
5
+ """
6
+ note: use with a base_lr of 1.0
7
+ """
8
+ def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
9
+ self.lr_warm_up_steps = warm_up_steps
10
+ self.lr_start = lr_start
11
+ self.lr_min = lr_min
12
+ self.lr_max = lr_max
13
+ self.lr_max_decay_steps = max_decay_steps
14
+ self.last_lr = 0.
15
+ self.verbosity_interval = verbosity_interval
16
+
17
+ def schedule(self, n):
18
+ if self.verbosity_interval > 0:
19
+ if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
20
+ if n < self.lr_warm_up_steps:
21
+ lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
22
+ self.last_lr = lr
23
+ return lr
24
+ else:
25
+ t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
26
+ t = min(t, 1.0)
27
+ lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
28
+ 1 + np.cos(t * np.pi))
29
+ self.last_lr = lr
30
+ return lr
31
+
32
+ def __call__(self, n):
33
+ return self.schedule(n)
34
+
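
The scheduler's docstring notes it should be driven with a base_lr of 1.0, since schedule(n) returns an absolute learning-rate value rather than a fraction. A small sketch of wiring it into PyTorch's LambdaLR; the model, step counts, and bounds are toy values:

```python
# Sketch: using LambdaWarmUpCosineScheduler as the lr_lambda of a LambdaLR.
# With the optimizer lr set to 1.0, the scheduled value is used directly.
import torch
from taming.lr_scheduler import LambdaWarmUpCosineScheduler

model = torch.nn.Linear(4, 4)                      # toy model
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
schedule = LambdaWarmUpCosineScheduler(
    warm_up_steps=1_000, lr_min=1e-6, lr_max=1e-4, lr_start=1e-6, max_decay_steps=100_000
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=schedule)

for step in range(3):                              # toy training loop
    optimizer.step()
    scheduler.step()
```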
taming/models/cond_transformer.py ADDED
@@ -0,0 +1,349 @@
1
+ import os, math
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import pytorch_lightning as pl
5
+
6
+ from main import instantiate_from_config
7
+ from taming.modules.util import SOSProvider
8
+
9
+
10
+ def disabled_train(self, mode=True):
11
+ """Overwrite model.train with this function to make sure train/eval mode
12
+ does not change anymore."""
13
+ return self
14
+
15
+
16
+ class Net2NetTransformer(pl.LightningModule):
17
+ def __init__(self,
18
+ transformer_config,
19
+ first_stage_config,
20
+ cond_stage_config,
21
+ permuter_config=None,
22
+ ckpt_path=None,
23
+ ignore_keys=[],
24
+ first_stage_key="image",
25
+ cond_stage_key="depth",
26
+ downsample_cond_size=-1,
27
+ pkeep=1.0,
28
+ sos_token=0,
29
+ unconditional=False,
30
+ ):
31
+ super().__init__()
32
+ self.be_unconditional = unconditional
33
+ self.sos_token = sos_token
34
+ self.first_stage_key = first_stage_key
35
+ self.cond_stage_key = cond_stage_key
36
+ self.init_first_stage_from_ckpt(first_stage_config)
37
+ self.init_cond_stage_from_ckpt(cond_stage_config)
38
+ if permuter_config is None:
39
+ permuter_config = {"target": "taming.modules.transformer.permuter.Identity"}
40
+ self.permuter = instantiate_from_config(config=permuter_config)
41
+ self.transformer = instantiate_from_config(config=transformer_config)
42
+
43
+ if ckpt_path is not None:
44
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
45
+ self.downsample_cond_size = downsample_cond_size
46
+ self.pkeep = pkeep
47
+
48
+ def init_from_ckpt(self, path, ignore_keys=list()):
49
+ sd = torch.load(path, map_location="cpu")["state_dict"]
50
+ for k in sd.keys():
51
+ for ik in ignore_keys:
52
+ if k.startswith(ik):
53
+ self.print("Deleting key {} from state_dict.".format(k))
54
+ del sd[k]
55
+ self.load_state_dict(sd, strict=False)
56
+ print(f"Restored from {path}")
57
+
58
+ def init_first_stage_from_ckpt(self, config):
59
+ model = instantiate_from_config(config)
60
+ model = model.eval()
61
+ model.train = disabled_train
62
+ self.first_stage_model = model
63
+
64
+ def init_cond_stage_from_ckpt(self, config):
65
+ if config == "__is_first_stage__":
66
+ print("Using first stage also as cond stage.")
67
+ self.cond_stage_model = self.first_stage_model
68
+ elif config == "__is_unconditional__" or self.be_unconditional:
69
+ print(f"Using no cond stage. Assuming the training is intended to be unconditional. "
70
+ f"Prepending {self.sos_token} as a sos token.")
71
+ self.be_unconditional = True
72
+ self.cond_stage_key = self.first_stage_key
73
+ self.cond_stage_model = SOSProvider(self.sos_token)
74
+ else:
75
+ model = instantiate_from_config(config)
76
+ model = model.eval()
77
+ model.train = disabled_train
78
+ self.cond_stage_model = model
79
+
80
+ def forward(self, x, c):
81
+ # one step to produce the logits
82
+ # x = target
83
+ # c = nucleus
84
+ _, z_indices = self.encode_to_z(x)
85
+ _, c_indices = self.encode_to_c(c)
86
+
87
+ if self.training and self.pkeep < 1.0:
88
+ mask = torch.bernoulli(self.pkeep*torch.ones(z_indices.shape,
89
+ device=z_indices.device))
90
+ mask = mask.round().to(dtype=torch.int64)
91
+ r_indices = torch.randint_like(z_indices, self.transformer.config.vocab_size)
92
+ a_indices = mask*z_indices+(1-mask)*r_indices
93
+ else:
94
+ a_indices = z_indices
95
+
96
+ cz_indices = torch.cat((c_indices, a_indices), dim=1)
97
+
98
+ # target includes all sequence elements (no need to handle first one
99
+ # differently because we are conditioning)
100
+ target = z_indices
101
+ # make the prediction
102
+ logits, _ = self.transformer(cz_indices[:, :-1])
103
+ # cut off conditioning outputs - output i corresponds to p(z_i | z_{<i}, c)
104
+ logits = logits[:, c_indices.shape[1]-1:]
105
+
106
+ return logits, target
107
+
108
+ def top_k_logits(self, logits, k):
109
+ v, ix = torch.topk(logits, k)
110
+ out = logits.clone()
111
+ out[out < v[..., [-1]]] = -float('Inf')
112
+ return out
113
+
114
+ @torch.no_grad()
115
+ def sample(self, x, c, steps, temperature=1.0, sample=False, top_k=None,
116
+ callback=lambda k: None):
117
+ x = torch.cat((c,x),dim=1)
118
+ block_size = self.transformer.get_block_size()
119
+ assert not self.transformer.training
120
+ if self.pkeep <= 0.0:
121
+ # one pass suffices since input is pure noise anyway
122
+ assert len(x.shape)==2
123
+ noise_shape = (x.shape[0], steps-1)
124
+ #noise = torch.randint(self.transformer.config.vocab_size, noise_shape).to(x)
125
+ noise = c.clone()[:,x.shape[1]-c.shape[1]:-1]
126
+ x = torch.cat((x,noise),dim=1)
127
+ logits, _ = self.transformer(x)
128
+ # take all logits for now and scale by temp
129
+ logits = logits / temperature
130
+ # optionally crop probabilities to only the top k options
131
+ if top_k is not None:
132
+ logits = self.top_k_logits(logits, top_k)
133
+ # apply softmax to convert to probabilities
134
+ probs = F.softmax(logits, dim=-1)
135
+ # sample from the distribution or take the most likely
136
+ if sample:
137
+ shape = probs.shape
138
+ probs = probs.reshape(shape[0]*shape[1],shape[2])
139
+ ix = torch.multinomial(probs, num_samples=1)
140
+ probs = probs.reshape(shape[0],shape[1],shape[2])
141
+ ix = ix.reshape(shape[0],shape[1])
142
+ else:
143
+ _, ix = torch.topk(probs, k=1, dim=-1)
144
+ # cut off conditioning
145
+ x = ix[:, c.shape[1]-1:]
146
+ else:
147
+ for k in range(steps):
148
+ callback(k)
149
+ assert x.size(1) <= block_size # make sure model can see conditioning
150
+ x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
151
+ logits, _ = self.transformer(x_cond)
152
+ # pluck the logits at the final step and scale by temperature
153
+ logits = logits[:, -1, :] / temperature
154
+ # optionally crop probabilities to only the top k options
155
+ if top_k is not None:
156
+ logits = self.top_k_logits(logits, top_k)
157
+ # apply softmax to convert to probabilities
158
+ probs = F.softmax(logits, dim=-1)
159
+ # sample from the distribution or take the most likely
160
+ if sample:
161
+ ix = torch.multinomial(probs, num_samples=1)
162
+ else:
163
+ _, ix = torch.topk(probs, k=1, dim=-1)
164
+ # append to the sequence and continue
165
+ x = torch.cat((x, ix), dim=1)
166
+ # cut off conditioning
167
+ x = x[:, c.shape[1]:]
168
+ return x
169
+
170
+ @torch.no_grad()
171
+ def encode_to_z(self, x):
172
+ quant_z, _, info = self.first_stage_model.encode(x)
173
+ indices = info[2].view(quant_z.shape[0], -1)
174
+ indices = self.permuter(indices)
175
+ return quant_z, indices
176
+
177
+ @torch.no_grad()
178
+ def encode_to_c(self, c):
179
+ if self.downsample_cond_size > -1:
180
+ c = F.interpolate(c, size=(self.downsample_cond_size, self.downsample_cond_size))
181
+
182
+ #quant_c, _, info = self.cond_stage_model.encode(x)
183
+ #indices = info[2].view(quant_c.shape[0], -1)
184
+ #indices = self.permuter(indices)
185
+ quant_c, _, [_,_,indices] = self.cond_stage_model.encode(c)
186
+ if len(indices.shape) != 2:
187
+ indices = indices.view(c.shape[0], -1)
188
+ return quant_c, indices
189
+
190
+ @torch.no_grad()
191
+ def decode_to_img(self, index, zshape):
192
+ index = self.permuter(index, reverse=True)
193
+ bhwc = (zshape[0],zshape[2],zshape[3],zshape[1])
194
+ quant_z = self.first_stage_model.quantize.get_codebook_entry(
195
+ index.reshape(-1), shape=bhwc)
196
+ x = self.first_stage_model.decode(quant_z)
197
+ return x
198
+
199
+ @torch.no_grad()
200
+ def log_images(self, batch, temperature=None, top_k=None, callback=None, lr_interface=False, **kwargs):
201
+ log = dict()
202
+
203
+ N = 4
204
+ if lr_interface:
205
+ x, c = self.get_xc(batch, N, diffuse=False, upsample_factor=8)
206
+ else:
207
+ x, c = self.get_xc(batch, N)
208
+ x = x.to(device=self.device)
209
+ c = c.to(device=self.device)
210
+
211
+ quant_z, z_indices = self.encode_to_z(x)
212
+ quant_c, c_indices = self.encode_to_c(c)
213
+
214
+ # create a "half"" sample
215
+ z_start_indices = z_indices[:,:z_indices.shape[1]//2]
216
+ index_sample = self.sample(z_start_indices, c_indices,
217
+ steps=z_indices.shape[1]-z_start_indices.shape[1],
218
+ temperature=temperature if temperature is not None else 1.0,
219
+ sample=True,
220
+ top_k=top_k if top_k is not None else 100,
221
+ callback=callback if callback is not None else lambda k: None)
222
+ x_sample = self.decode_to_img(index_sample, quant_z.shape)
223
+
224
+ # sample
225
+ z_start_indices = z_indices[:, :0]
226
+ index_sample = self.sample(z_start_indices, c_indices,
227
+ steps=z_indices.shape[1],
228
+ temperature=temperature if temperature is not None else 1.0,
229
+ sample=True,
230
+ top_k=top_k if top_k is not None else 100,
231
+ callback=callback if callback is not None else lambda k: None)
232
+ x_sample_nopix = self.decode_to_img(index_sample, quant_z.shape)
233
+
234
+ # det sample
235
+ z_start_indices = z_indices[:, :0]
236
+ index_sample = self.sample(z_start_indices, c_indices,
237
+ steps=z_indices.shape[1],
238
+ sample=False,
239
+ callback=callback if callback is not None else lambda k: None)
240
+ x_sample_det = self.decode_to_img(index_sample, quant_z.shape)
241
+
242
+ # reconstruction
243
+ x_rec = self.decode_to_img(z_indices, quant_z.shape)
244
+
245
+ log["inputs"] = x
246
+ log["reconstructions"] = x_rec
247
+
248
+ if self.cond_stage_key != "image" or self.cond_stage_key != "nucleus" or self.cond_stage_key != "target":
249
+ cond_rec = self.cond_stage_model.decode(quant_c)
250
+ if self.cond_stage_key == "segmentation":
251
+ # get image from segmentation mask
252
+ num_classes = cond_rec.shape[1]
253
+
254
+ c = torch.argmax(c, dim=1, keepdim=True)
255
+ c = F.one_hot(c, num_classes=num_classes)
256
+ c = c.squeeze(1).permute(0, 3, 1, 2).float()
257
+ c = self.cond_stage_model.to_rgb(c)
258
+
259
+ cond_rec = torch.argmax(cond_rec, dim=1, keepdim=True)
260
+ cond_rec = F.one_hot(cond_rec, num_classes=num_classes)
261
+ cond_rec = cond_rec.squeeze(1).permute(0, 3, 1, 2).float()
262
+ cond_rec = self.cond_stage_model.to_rgb(cond_rec)
263
+ log["conditioning_rec"] = cond_rec
264
+ log["conditioning"] = c
265
+
266
+ log["samples_half"] = x_sample
267
+ log["samples_nopix"] = x_sample_nopix
268
+ log["samples_det"] = x_sample_det
269
+ return log
270
+
271
+ def get_input(self, key, batch):
272
+ x = batch[key]
273
+ if len(x.shape) == 3:
274
+ x = x[..., None]
275
+ #if len(x.shape) == 4:
276
+ # x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
277
+ if x.dtype == torch.double:
278
+ x = x.float()
279
+ return x
280
+
281
+ def get_xc(self, batch, N=None):
282
+ x = self.get_input(self.first_stage_key, batch)
283
+ c = self.get_input(self.cond_stage_key, batch)
284
+ if N is not None:
285
+ x = x[:N]
286
+ c = c[:N]
287
+ return x, c
288
+
289
+ def shared_step(self, batch):
290
+ x, c = self.get_xc(batch)
291
+ logits, target = self(x, c)
292
+ loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
293
+ return loss
294
+
295
+ def training_step(self, batch, batch_idx):
296
+ loss = self.shared_step(batch)
297
+ self.log("train/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
298
+ return loss
299
+
300
+ def validation_step(self, batch, batch_idx):
301
+ loss = self.shared_step(batch)
302
+ self.log("val/loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
303
+ return loss
304
+
305
+ def configure_optimizers(self):
306
+ """
307
+ Following minGPT:
308
+ This long function is unfortunately doing something very simple and is being very defensive:
309
+ We are separating out all parameters of the model into two buckets: those that will experience
310
+ weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
311
+ We are then returning the PyTorch optimizer object.
312
+ """
313
+ # separate out all parameters to those that will and won't experience regularizing weight decay
314
+ decay = set()
315
+ no_decay = set()
316
+ whitelist_weight_modules = (torch.nn.Linear, )
317
+ blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
318
+ for mn, m in self.transformer.named_modules():
319
+ for pn, p in m.named_parameters():
320
+ fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
321
+
322
+ if pn.endswith('bias'):
323
+ # all biases will not be decayed
324
+ no_decay.add(fpn)
325
+ elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
326
+ # weights of whitelist modules will be weight decayed
327
+ decay.add(fpn)
328
+ elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
329
+ # weights of blacklist modules will NOT be weight decayed
330
+ no_decay.add(fpn)
331
+
332
+ # special case the position embedding parameter in the root GPT module as not decayed
333
+ no_decay.add('pos_emb')
334
+
335
+ # validate that we considered every parameter
336
+ param_dict = {pn: p for pn, p in self.transformer.named_parameters()}
337
+ inter_params = decay & no_decay
338
+ union_params = decay | no_decay
339
+ assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
340
+ assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
341
+ % (str(param_dict.keys() - union_params), )
342
+
343
+ # create the pytorch optimizer object
344
+ optim_groups = [
345
+ {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": 0.01},
346
+ {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
347
+ ]
348
+ optimizer = torch.optim.AdamW(optim_groups, lr=self.learning_rate, betas=(0.9, 0.95))
349
+ return optimizer
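
The configure_optimizers docstring above describes the minGPT-style split into parameters that do and do not receive weight decay. A stripped-down sketch of the same bucketing on a toy module (not the exact code path used by Net2NetTransformer):

```python
# Minimal sketch of the decay / no-decay split described above, on a toy module.
import torch

toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
decay, no_decay = [], []
for module in toy.modules():
    for name, param in module.named_parameters(recurse=False):
        # biases, LayerNorm weights, and Embedding weights are excluded from decay
        if name.endswith("bias") or isinstance(module, (torch.nn.LayerNorm, torch.nn.Embedding)):
            no_decay.append(param)
        else:
            decay.append(param)

optimizer = torch.optim.AdamW(
    [{"params": decay, "weight_decay": 0.01},
     {"params": no_decay, "weight_decay": 0.0}],
    lr=4.5e-6, betas=(0.9, 0.95),
)
```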
taming/models/dummy_cond_stage.py ADDED
@@ -0,0 +1,22 @@
1
+ from torch import Tensor
2
+
3
+
4
+ class DummyCondStage:
5
+ def __init__(self, conditional_key):
6
+ self.conditional_key = conditional_key
7
+ self.train = None
8
+
9
+ def eval(self):
10
+ return self
11
+
12
+ @staticmethod
13
+ def encode(c: Tensor):
14
+ return c, None, (None, None, c)
15
+
16
+ @staticmethod
17
+ def decode(c: Tensor):
18
+ return c
19
+
20
+ @staticmethod
21
+ def to_rgb(c: Tensor):
22
+ return c
taming/models/vqgan.py ADDED
@@ -0,0 +1,649 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import pytorch_lightning as pl
4
+
5
+ from celle_taming_main import instantiate_from_config
6
+
7
+ from taming.modules.diffusionmodules.model import Encoder, Decoder
8
+ from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
9
+ from taming.modules.vqvae.quantize import GumbelQuantize
10
+ from taming.modules.vqvae.quantize import EMAVectorQuantizer
11
+
12
+
13
+ class VQModel(pl.LightningModule):
14
+ def __init__(
15
+ self,
16
+ ddconfig,
17
+ lossconfig,
18
+ n_embed,
19
+ embed_dim,
20
+ ckpt_path=None,
21
+ ignore_keys=[],
22
+ image_key="image",
23
+ colorize_nlabels=None,
24
+ monitor=None,
25
+ remap=None,
26
+ sane_index_shape=False, # tell vector quantizer to return indices as bhw
27
+ ):
28
+ super().__init__()
29
+ self.image_key = image_key
30
+ self.encoder = Encoder(**ddconfig)
31
+ self.decoder = Decoder(**ddconfig)
32
+ self.loss = instantiate_from_config(lossconfig)
33
+ self.quantize = VectorQuantizer(
34
+ n_embed,
35
+ embed_dim,
36
+ beta=0.25,
37
+ remap=remap,
38
+ sane_index_shape=sane_index_shape,
39
+ )
40
+ self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
41
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
42
+ if ckpt_path is not None:
43
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
44
+ self.image_key = image_key
45
+ if colorize_nlabels is not None:
46
+ assert type(colorize_nlabels) == int
47
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
48
+ if monitor is not None:
49
+ self.monitor = monitor
50
+
51
+ def init_from_ckpt(self, path, ignore_keys=list()):
52
+ sd = torch.load(path, map_location="cpu")["state_dict"]
53
+ keys = list(sd.keys())
54
+ for k in keys:
55
+ for ik in ignore_keys:
56
+ if k.startswith(ik):
57
+ print("Deleting key {} from state_dict.".format(k))
58
+ del sd[k]
59
+ self.load_state_dict(sd, strict=False)
60
+ print(f"Restored from {path}")
61
+
62
+ def encode(self, x):
63
+ h = self.encoder(x)
64
+ h = self.quant_conv(h)
65
+ quant, emb_loss, info = self.quantize(h)
66
+ return quant, emb_loss, info
67
+
68
+ def decode(self, quant):
69
+ quant = self.post_quant_conv(quant)
70
+ dec = self.decoder(quant)
71
+ return dec
72
+
73
+ def decode_code(self, code_b):
74
+ quant_b = self.quantize.embed_code(code_b)
75
+ dec = self.decode(quant_b)
76
+ return dec
77
+
78
+ def forward(self, input):
79
+ quant, diff, _ = self.encode(input)
80
+ dec = self.decode(quant)
81
+ return dec, diff
82
+
83
+ def get_input(self, batch, k):
84
+
85
+ if k == "mixed":
86
+ keys = ["nucleus", "target"]
87
+ index = torch.randint(low=0, high=2, size=(1,), dtype=int).item()
88
+ k = keys[index]
89
+
90
+ x = batch[k]
91
+ if len(x.shape) == 3:
92
+ x = x[..., None]
93
+
94
+ # x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
95
+ return x
96
+
97
+ def training_step(self, batch, batch_idx=None, optimizer_idx=0):
98
+
99
+ if type(batch) == dict:
100
+
101
+ x = self.get_input(batch, self.image_key)
102
+
103
+ else:
104
+ x = batch
105
+
106
+ xrec, qloss = self(
107
+ x,
108
+ )
109
+
110
+ if optimizer_idx == 0:
111
+ # autoencode
112
+ aeloss, log_dict_ae = self.loss(
113
+ qloss,
114
+ x,
115
+ xrec,
116
+ optimizer_idx,
117
+ self.global_step,
118
+ last_layer=self.get_last_layer(),
119
+ split="train",
120
+ )
121
+
122
+ self.log(
123
+ "train/aeloss",
124
+ aeloss,
125
+ prog_bar=True,
126
+ logger=True,
127
+ on_step=True,
128
+ on_epoch=True,
129
+ sync_dist=True,
130
+ )
131
+ self.log_dict(
132
+ log_dict_ae,
133
+ prog_bar=False,
134
+ logger=True,
135
+ on_step=True,
136
+ on_epoch=True,
137
+ sync_dist=True,
138
+ )
139
+ return aeloss
140
+
141
+ if optimizer_idx == 1:
142
+ # discriminator
143
+ discloss, log_dict_disc = self.loss(
144
+ qloss,
145
+ x,
146
+ xrec,
147
+ optimizer_idx,
148
+ self.global_step,
149
+ last_layer=self.get_last_layer(),
150
+ split="train",
151
+ )
152
+ self.log(
153
+ "train/discloss",
154
+ discloss,
155
+ prog_bar=True,
156
+ logger=True,
157
+ on_step=True,
158
+ on_epoch=True,
159
+ sync_dist=True,
160
+ )
161
+ self.log_dict(
162
+ log_dict_disc,
163
+ prog_bar=False,
164
+ logger=True,
165
+ on_step=True,
166
+ on_epoch=True,
167
+ sync_dist=True,
168
+ )
169
+ return discloss
170
+
171
+ def validation_step(self, batch, batch_idx):
172
+
173
+ if type(batch) == dict:
174
+
175
+ x = self.get_input(batch, self.image_key)
176
+
177
+ else:
178
+ x = batch
179
+
180
+ xrec, qloss = self(x)
181
+ aeloss, log_dict_ae = self.loss(
182
+ qloss,
183
+ x,
184
+ xrec,
185
+ 0,
186
+ self.global_step,
187
+ last_layer=self.get_last_layer(),
188
+ split="val",
189
+ )
190
+
191
+ discloss, log_dict_disc = self.loss(
192
+ qloss,
193
+ x,
194
+ xrec,
195
+ 1,
196
+ self.global_step,
197
+ last_layer=self.get_last_layer(),
198
+ split="val",
199
+ )
200
+ # rec_loss = log_dict_ae["val/rec_loss"]
201
+ # self.log(
202
+ # "val/rec_loss",
203
+ # rec_loss,
204
+ # prog_bar=True,
205
+ # logger=True,
206
+ # on_step=True,
207
+ # on_epoch=True,
208
+ # sync_dist=True,
209
+ # )
210
+ # self.log(
211
+ # "val/aeloss",
212
+ # aeloss,
213
+ # prog_bar=True,
214
+ # logger=True,
215
+ # on_step=True,
216
+ # on_epoch=True,
217
+ # sync_dist=True,
218
+ # )
219
+
220
+ for key, value in log_dict_disc.items():
221
+ if key in log_dict_ae:
222
+ log_dict_ae[key].extend(value)
223
+ else:
224
+ log_dict_ae[key] = value
225
+
226
+ self.log_dict(log_dict_ae, sync_dist=True)
227
+ return self.log_dict
228
+
229
+ def configure_optimizers(self):
230
+ lr = self.learning_rate
231
+ opt_ae = torch.optim.Adam(
232
+ list(self.encoder.parameters())
233
+ + list(self.decoder.parameters())
234
+ + list(self.quantize.parameters())
235
+ + list(self.quant_conv.parameters())
236
+ + list(self.post_quant_conv.parameters()),
237
+ lr=lr,
238
+ betas=(0.5, 0.9),
239
+ )
240
+ opt_disc = torch.optim.Adam(
241
+ self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)
242
+ )
243
+ return [opt_ae, opt_disc], []
244
+
245
+ def get_last_layer(self):
246
+ return self.decoder.conv_out.weight
247
+
248
+ def log_images(self, batch, **kwargs):
249
+ log = dict()
250
+ x = self.get_input(batch, self.image_key)
251
+ x = x.to(self.device)
252
+ xrec, _ = self(x)
253
+ if x.shape[1] > 3:
254
+ # colorize with random projection
255
+ assert xrec.shape[1] > 3
256
+ x = self.to_rgb(x)
257
+ xrec = self.to_rgb(xrec)
258
+ log["inputs"] = x
259
+ log["reconstructions"] = xrec
260
+ return log
261
+
262
+ def to_rgb(self, x):
263
+ assert self.image_key == "segmentation"
264
+ if not hasattr(self, "colorize"):
265
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
266
+ x = F.conv2d(x, weight=self.colorize)
267
+ x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
268
+ return x
269
+
270
+
271
+ class VQSegmentationModel(VQModel):
272
+ def __init__(self, n_labels, *args, **kwargs):
273
+ super().__init__(*args, **kwargs)
274
+ self.register_buffer("colorize", torch.randn(3, n_labels, 1, 1))
275
+
276
+ def configure_optimizers(self):
277
+ lr = self.learning_rate
278
+ opt_ae = torch.optim.Adam(
279
+ list(self.encoder.parameters())
280
+ + list(self.decoder.parameters())
281
+ + list(self.quantize.parameters())
282
+ + list(self.quant_conv.parameters())
283
+ + list(self.post_quant_conv.parameters()),
284
+ lr=lr,
285
+ betas=(0.5, 0.9),
286
+ )
287
+ return opt_ae
288
+
289
+ def training_step(self, batch, batch_idx):
290
+ x = self.get_input(batch, self.image_key)
291
+ xrec, qloss = self(x)
292
+ aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="train")
293
+ self.log_dict(
294
+ log_dict_ae,
295
+ prog_bar=False,
296
+ logger=True,
297
+ on_step=True,
298
+ on_epoch=True,
299
+ sync_dist=True,
300
+ )
301
+ return aeloss
302
+
303
+ def validation_step(self, batch, batch_idx):
304
+ x = self.get_input(batch, self.image_key)
305
+ xrec, qloss = self(x)
306
+ aeloss, log_dict_ae = self.loss(qloss, x, xrec, split="val")
307
+ self.log_dict(
308
+ log_dict_ae,
309
+ prog_bar=False,
310
+ logger=True,
311
+ on_step=True,
312
+ on_epoch=True,
313
+ sync_dist=True,
314
+ )
315
+ total_loss = log_dict_ae["val/total_loss"]
316
+ self.log(
317
+ "val/total_loss",
318
+ total_loss,
319
+ prog_bar=True,
320
+ logger=True,
321
+ on_step=True,
322
+ on_epoch=True,
323
+ sync_dist=True,
324
+ )
325
+ return aeloss
326
+
327
+ @torch.no_grad()
328
+ def log_images(self, batch, **kwargs):
329
+ log = dict()
330
+ x = self.get_input(batch, self.image_key)
331
+ x = x.to(self.device)
332
+ xrec, _ = self(x)
333
+ if x.shape[1] > 3:
334
+ # colorize with random projection
335
+ assert xrec.shape[1] > 3
336
+ # convert logits to indices
337
+ xrec = torch.argmax(xrec, dim=1, keepdim=True)
338
+ xrec = F.one_hot(xrec, num_classes=x.shape[1])
339
+ xrec = xrec.squeeze(1).permute(0, 3, 1, 2).float()
340
+ x = self.to_rgb(x)
341
+ xrec = self.to_rgb(xrec)
342
+ log["inputs"] = x
343
+ log["reconstructions"] = xrec
344
+ return log
345
+
346
+
347
+ class VQNoDiscModel(VQModel):
348
+ def __init__(
349
+ self,
350
+ ddconfig,
351
+ lossconfig,
352
+ n_embed,
353
+ embed_dim,
354
+ ckpt_path=None,
355
+ ignore_keys=[],
356
+ image_key="image",
357
+ colorize_nlabels=None,
358
+ ):
359
+ super().__init__(
360
+ ddconfig=ddconfig,
361
+ lossconfig=lossconfig,
362
+ n_embed=n_embed,
363
+ embed_dim=embed_dim,
364
+ ckpt_path=ckpt_path,
365
+ ignore_keys=ignore_keys,
366
+ image_key=image_key,
367
+ colorize_nlabels=colorize_nlabels,
368
+ )
369
+
370
+ def training_step(self, batch, batch_idx):
371
+ x = self.get_input(batch, self.image_key)
372
+ xrec, qloss = self(x)
373
+ # autoencode
374
+ aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="train")
375
+ output = pl.TrainResult(minimize=aeloss)
376
+ output.log(
377
+ "train/aeloss",
378
+ aeloss,
379
+ prog_bar=True,
380
+ logger=True,
381
+ on_step=True,
382
+ on_epoch=True,
383
+ )
384
+ output.log_dict(
385
+ log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True
386
+ )
387
+ return output
388
+
389
+ def validation_step(self, batch, batch_idx):
390
+ x = self.get_input(batch, self.image_key)
391
+ xrec, qloss = self(x)
392
+ aeloss, log_dict_ae = self.loss(qloss, x, xrec, self.global_step, split="val")
393
+ rec_loss = log_dict_ae["val/rec_loss"]
394
+ output = pl.EvalResult(checkpoint_on=rec_loss)
395
+ output.log(
396
+ "val/rec_loss",
397
+ rec_loss,
398
+ prog_bar=True,
399
+ logger=True,
400
+ on_step=True,
401
+ on_epoch=True,
402
+ )
403
+ output.log(
404
+ "val/aeloss",
405
+ aeloss,
406
+ prog_bar=True,
407
+ logger=True,
408
+ on_step=True,
409
+ on_epoch=True,
410
+ )
411
+ output.log_dict(log_dict_ae)
412
+
413
+ return output
414
+
415
+ def configure_optimizers(self):
416
+ optimizer = torch.optim.Adam(
417
+ list(self.encoder.parameters())
418
+ + list(self.decoder.parameters())
419
+ + list(self.quantize.parameters())
420
+ + list(self.quant_conv.parameters())
421
+ + list(self.post_quant_conv.parameters()),
422
+ lr=self.learning_rate,
423
+ betas=(0.5, 0.9),
424
+ )
425
+ return optimizer
426
+
427
+
428
+ class GumbelVQ(VQModel):
429
+ def __init__(
430
+ self,
431
+ ddconfig,
432
+ lossconfig,
433
+ n_embed,
434
+ embed_dim,
435
+ temperature_scheduler_config,
436
+ ckpt_path=None,
437
+ ignore_keys=[],
438
+ image_key="image",
439
+ colorize_nlabels=None,
440
+ monitor=None,
441
+ kl_weight=1e-8,
442
+ remap=None,
443
+ ):
444
+
445
+ z_channels = ddconfig["z_channels"]
446
+ super().__init__(
447
+ ddconfig,
448
+ lossconfig,
449
+ n_embed,
450
+ embed_dim,
451
+ ckpt_path=None,
452
+ ignore_keys=ignore_keys,
453
+ image_key=image_key,
454
+ colorize_nlabels=colorize_nlabels,
455
+ monitor=monitor,
456
+ )
457
+
458
+ self.loss.n_classes = n_embed
459
+ self.vocab_size = n_embed
460
+
461
+ self.quantize = GumbelQuantize(
462
+ z_channels,
463
+ embed_dim,
464
+ n_embed=n_embed,
465
+ kl_weight=kl_weight,
466
+ temp_init=1.0,
467
+ remap=remap,
468
+ )
469
+
470
+ self.temperature_scheduler = instantiate_from_config(
471
+ temperature_scheduler_config
472
+ ) # annealing of temp
473
+
474
+ if ckpt_path is not None:
475
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
476
+
477
+ def temperature_scheduling(self):
478
+ self.quantize.temperature = self.temperature_scheduler(self.global_step)
479
+
480
+ def encode_to_prequant(self, x):
481
+ h = self.encoder(x)
482
+ h = self.quant_conv(h)
483
+ return h
484
+
485
+ def decode_code(self, code_b):
486
+ raise NotImplementedError
487
+
488
+ def training_step(self, batch, batch_idx=None, optimizer_idx=0):
489
+ self.temperature_scheduling()
490
+ x = self.get_input(batch, self.image_key)
491
+ xrec, qloss = self(x)
492
+
493
+ if optimizer_idx == 0:
494
+ # autoencode
495
+ aeloss, log_dict_ae = self.loss(
496
+ qloss,
497
+ x,
498
+ xrec,
499
+ optimizer_idx,
500
+ self.global_step,
501
+ last_layer=self.get_last_layer(),
502
+ split="train",
503
+ )
504
+
505
+ self.log_dict(
506
+ log_dict_ae,
507
+ prog_bar=False,
508
+ logger=True,
509
+ on_step=True,
510
+ on_epoch=True,
511
+ sync_dist=True,
512
+ )
513
+ self.log(
514
+ "temperature",
515
+ self.quantize.temperature,
516
+ prog_bar=False,
517
+ logger=True,
518
+ on_step=True,
519
+ on_epoch=True,
520
+ sync_dist=True,
521
+ )
522
+ return aeloss
523
+
524
+ if optimizer_idx == 1:
525
+ # discriminator
526
+ discloss, log_dict_disc = self.loss(
527
+ qloss,
528
+ x,
529
+ xrec,
530
+ optimizer_idx,
531
+ self.global_step,
532
+ last_layer=self.get_last_layer(),
533
+ split="train",
534
+ )
535
+ self.log_dict(
536
+ log_dict_disc,
537
+ prog_bar=False,
538
+ logger=True,
539
+ on_step=True,
540
+ on_epoch=True,
541
+ sync_dist=True,
542
+ )
543
+ return discloss
544
+
545
+ def validation_step(self, batch, batch_idx):
546
+ x = self.get_input(batch, self.image_key)
547
+ xrec, qloss = self(x)
548
+ aeloss, log_dict_ae = self.loss(
549
+ qloss,
550
+ x,
551
+ xrec,
552
+ 0,
553
+ self.global_step,
554
+ last_layer=self.get_last_layer(),
555
+ split="val",
556
+ )
557
+
558
+ discloss, log_dict_disc = self.loss(
559
+ qloss,
560
+ x,
561
+ xrec,
562
+ 1,
563
+ self.global_step,
564
+ last_layer=self.get_last_layer(),
565
+ split="val",
566
+ )
567
+ rec_loss = log_dict_ae["val/rec_loss"]
568
+ self.log(
569
+ "val/rec_loss",
570
+ rec_loss,
571
+ prog_bar=True,
572
+ logger=True,
573
+ on_step=False,
574
+ on_epoch=True,
575
+ sync_dist=True,
576
+ )
577
+ self.log(
578
+ "val/aeloss",
579
+ aeloss,
580
+ prog_bar=True,
581
+ logger=True,
582
+ on_step=False,
583
+ on_epoch=True,
584
+ sync_dist=True,
585
+ )
586
+ self.log_dict(log_dict_ae, sync_dist=True)
587
+ self.log_dict(log_dict_disc, sync_dist=True)
588
+ return self.log_dict
589
+
590
+ def log_images(self, batch, **kwargs):
591
+ log = dict()
592
+ x = self.get_input(batch, self.image_key)
593
+ x = x.to(self.device)
594
+ # encode
595
+ h = self.encoder(x)
596
+ h = self.quant_conv(h)
597
+ quant, _, _ = self.quantize(h)
598
+ # decode
599
+ x_rec = self.decode(quant)
600
+ log["inputs"] = x
601
+ log["reconstructions"] = x_rec
602
+ return log
603
+
604
+
605
+ class EMAVQ(VQModel):
606
+ def __init__(
607
+ self,
608
+ ddconfig,
609
+ lossconfig,
610
+ n_embed,
611
+ embed_dim,
612
+ ckpt_path=None,
613
+ ignore_keys=[],
614
+ image_key="image",
615
+ colorize_nlabels=None,
616
+ monitor=None,
617
+ remap=None,
618
+ sane_index_shape=False, # tell vector quantizer to return indices as bhw
619
+ ):
620
+ super().__init__(
621
+ ddconfig,
622
+ lossconfig,
623
+ n_embed,
624
+ embed_dim,
625
+ ckpt_path=None,
626
+ ignore_keys=ignore_keys,
627
+ image_key=image_key,
628
+ colorize_nlabels=colorize_nlabels,
629
+ monitor=monitor,
630
+ )
631
+ self.quantize = EMAVectorQuantizer(
632
+ n_embed=n_embed, embedding_dim=embed_dim, beta=0.25, remap=remap
633
+ )
634
+
635
+ def configure_optimizers(self):
636
+ lr = self.learning_rate
637
+ # Remove self.quantize from parameter list since it is updated via EMA
638
+ opt_ae = torch.optim.Adam(
639
+ list(self.encoder.parameters())
640
+ + list(self.decoder.parameters())
641
+ + list(self.quant_conv.parameters())
642
+ + list(self.post_quant_conv.parameters()),
643
+ lr=lr,
644
+ betas=(0.5, 0.9),
645
+ )
646
+ opt_disc = torch.optim.Adam(
647
+ self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)
648
+ )
649
+ return [opt_ae, opt_disc], []
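
As a reference for how the pieces of VQModel fit together, a hedged sketch of the encode, quantize, decode round trip built directly from its components; the ddconfig values are illustrative toy settings, not the configs shipped with this Space:

```python
# Illustrative round trip through Encoder -> VectorQuantizer -> Decoder, mirroring
# VQModel.encode / VQModel.decode above. All hyperparameters are toy values.
import torch
from taming.modules.diffusionmodules.model import Encoder, Decoder
from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer

ddconfig = dict(
    double_z=False, z_channels=64, resolution=64, in_channels=1, out_ch=1,
    ch=32, ch_mult=(1, 2, 4), num_res_blocks=1, attn_resolutions=[16], dropout=0.0,
)
embed_dim, n_embed = 8, 512

encoder, decoder = Encoder(**ddconfig), Decoder(**ddconfig)
quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25)

x = torch.randn(1, 1, 64, 64)                      # toy single-channel image
quant, emb_loss, info = quantize(quant_conv(encoder(x)))
x_rec = decoder(post_quant_conv(quant))
print(x_rec.shape)                                 # torch.Size([1, 1, 64, 64])
```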
taming/modules/autoencoder/lpips/vgg.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a78928a0af1e5f0fcb1f3b9e8f8c3a2a5a3de244d830ad5c1feddc79b8432868
3
+ size 7289
taming/modules/diffusionmodules/model.py ADDED
@@ -0,0 +1,776 @@
1
+ # pytorch_diffusion + derived encoder decoder
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+
7
+
8
+ def get_timestep_embedding(timesteps, embedding_dim):
9
+ """
10
+ This matches the implementation in Denoising Diffusion Probabilistic Models:
11
+ From Fairseq.
12
+ Build sinusoidal embeddings.
13
+ This matches the implementation in tensor2tensor, but differs slightly
14
+ from the description in Section 3.5 of "Attention Is All You Need".
15
+ """
16
+ assert len(timesteps.shape) == 1
17
+
18
+ half_dim = embedding_dim // 2
19
+ emb = math.log(10000) / (half_dim - 1)
20
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
21
+ emb = emb.to(device=timesteps.device)
22
+ emb = timesteps.float()[:, None] * emb[None, :]
23
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
24
+ if embedding_dim % 2 == 1: # zero pad
25
+ emb = torch.nn.functional.pad(emb, (0,1,0,0))
26
+ return emb
27
+
28
+
29
+ def nonlinearity(x):
30
+ # swish
31
+ return x*torch.sigmoid(x)
32
+
33
+
34
+ def Normalize(in_channels):
35
+ return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
36
+
37
+
38
+ class Upsample(nn.Module):
39
+ def __init__(self, in_channels, with_conv):
40
+ super().__init__()
41
+ self.with_conv = with_conv
42
+ if self.with_conv:
43
+ self.conv = torch.nn.Conv2d(in_channels,
44
+ in_channels,
45
+ kernel_size=3,
46
+ stride=1,
47
+ padding=1)
48
+
49
+ def forward(self, x):
50
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
51
+ if self.with_conv:
52
+ x = self.conv(x)
53
+ return x
54
+
55
+
56
+ class Downsample(nn.Module):
57
+ def __init__(self, in_channels, with_conv):
58
+ super().__init__()
59
+ self.with_conv = with_conv
60
+ if self.with_conv:
61
+ # no asymmetric padding in torch conv, must do it ourselves
62
+ self.conv = torch.nn.Conv2d(in_channels,
63
+ in_channels,
64
+ kernel_size=3,
65
+ stride=2,
66
+ padding=0)
67
+
68
+ def forward(self, x):
69
+ if self.with_conv:
70
+ pad = (0,1,0,1)
71
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
72
+ x = self.conv(x)
73
+ else:
74
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
75
+ return x
76
+
77
+
78
+ class ResnetBlock(nn.Module):
79
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
80
+ dropout, temb_channels=512):
81
+ super().__init__()
82
+ self.in_channels = in_channels
83
+ out_channels = in_channels if out_channels is None else out_channels
84
+ self.out_channels = out_channels
85
+ self.use_conv_shortcut = conv_shortcut
86
+
87
+ self.norm1 = Normalize(in_channels)
88
+ self.conv1 = torch.nn.Conv2d(in_channels,
89
+ out_channels,
90
+ kernel_size=3,
91
+ stride=1,
92
+ padding=1)
93
+ if temb_channels > 0:
94
+ self.temb_proj = torch.nn.Linear(temb_channels,
95
+ out_channels)
96
+ self.norm2 = Normalize(out_channels)
97
+ self.dropout = torch.nn.Dropout(dropout)
98
+ self.conv2 = torch.nn.Conv2d(out_channels,
99
+ out_channels,
100
+ kernel_size=3,
101
+ stride=1,
102
+ padding=1)
103
+ if self.in_channels != self.out_channels:
104
+ if self.use_conv_shortcut:
105
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
106
+ out_channels,
107
+ kernel_size=3,
108
+ stride=1,
109
+ padding=1)
110
+ else:
111
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
112
+ out_channels,
113
+ kernel_size=1,
114
+ stride=1,
115
+ padding=0)
116
+
117
+ def forward(self, x, temb):
118
+ h = x
119
+ h = self.norm1(h)
120
+ h = nonlinearity(h)
121
+ h = self.conv1(h)
122
+
123
+ if temb is not None:
124
+ h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
125
+
126
+ h = self.norm2(h)
127
+ h = nonlinearity(h)
128
+ h = self.dropout(h)
129
+ h = self.conv2(h)
130
+
131
+ if self.in_channels != self.out_channels:
132
+ if self.use_conv_shortcut:
133
+ x = self.conv_shortcut(x)
134
+ else:
135
+ x = self.nin_shortcut(x)
136
+
137
+ return x+h
138
+
139
+
140
+ class AttnBlock(nn.Module):
141
+ def __init__(self, in_channels):
142
+ super().__init__()
143
+ self.in_channels = in_channels
144
+
145
+ self.norm = Normalize(in_channels)
146
+ self.q = torch.nn.Conv2d(in_channels,
147
+ in_channels,
148
+ kernel_size=1,
149
+ stride=1,
150
+ padding=0)
151
+ self.k = torch.nn.Conv2d(in_channels,
152
+ in_channels,
153
+ kernel_size=1,
154
+ stride=1,
155
+ padding=0)
156
+ self.v = torch.nn.Conv2d(in_channels,
157
+ in_channels,
158
+ kernel_size=1,
159
+ stride=1,
160
+ padding=0)
161
+ self.proj_out = torch.nn.Conv2d(in_channels,
162
+ in_channels,
163
+ kernel_size=1,
164
+ stride=1,
165
+ padding=0)
166
+
167
+
168
+ def forward(self, x):
169
+ h_ = x
170
+ h_ = self.norm(h_)
171
+ q = self.q(h_)
172
+ k = self.k(h_)
173
+ v = self.v(h_)
174
+
175
+ # compute attention
176
+ b,c,h,w = q.shape
177
+ q = q.reshape(b,c,h*w)
178
+ q = q.permute(0,2,1) # b,hw,c
179
+ k = k.reshape(b,c,h*w) # b,c,hw
180
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
181
+ w_ = w_ * (int(c)**(-0.5))
182
+ w_ = torch.nn.functional.softmax(w_, dim=2)
183
+
184
+ # attend to values
185
+ v = v.reshape(b,c,h*w)
186
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
187
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
188
+ h_ = h_.reshape(b,c,h,w)
189
+
190
+ h_ = self.proj_out(h_)
191
+
192
+ return x+h_
193
+
194
+
195
+ class Model(nn.Module):
196
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
197
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
198
+ resolution, use_timestep=True):
199
+ super().__init__()
200
+ self.ch = ch
201
+ self.temb_ch = self.ch*4
202
+ self.num_resolutions = len(ch_mult)
203
+ self.num_res_blocks = num_res_blocks
204
+ self.resolution = resolution
205
+ self.in_channels = in_channels
206
+
207
+ self.use_timestep = use_timestep
208
+ if self.use_timestep:
209
+ # timestep embedding
210
+ self.temb = nn.Module()
211
+ self.temb.dense = nn.ModuleList([
212
+ torch.nn.Linear(self.ch,
213
+ self.temb_ch),
214
+ torch.nn.Linear(self.temb_ch,
215
+ self.temb_ch),
216
+ ])
217
+
218
+ # downsampling
219
+ self.conv_in = torch.nn.Conv2d(in_channels,
220
+ self.ch,
221
+ kernel_size=3,
222
+ stride=1,
223
+ padding=1)
224
+
225
+ curr_res = resolution
226
+ in_ch_mult = (1,)+tuple(ch_mult)
227
+ self.down = nn.ModuleList()
228
+ for i_level in range(self.num_resolutions):
229
+ block = nn.ModuleList()
230
+ attn = nn.ModuleList()
231
+ block_in = ch*in_ch_mult[i_level]
232
+ block_out = ch*ch_mult[i_level]
233
+ for i_block in range(self.num_res_blocks):
234
+ block.append(ResnetBlock(in_channels=block_in,
235
+ out_channels=block_out,
236
+ temb_channels=self.temb_ch,
237
+ dropout=dropout))
238
+ block_in = block_out
239
+ if curr_res in attn_resolutions:
240
+ attn.append(AttnBlock(block_in))
241
+ down = nn.Module()
242
+ down.block = block
243
+ down.attn = attn
244
+ if i_level != self.num_resolutions-1:
245
+ down.downsample = Downsample(block_in, resamp_with_conv)
246
+ curr_res = curr_res // 2
247
+ self.down.append(down)
248
+
249
+ # middle
250
+ self.mid = nn.Module()
251
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
252
+ out_channels=block_in,
253
+ temb_channels=self.temb_ch,
254
+ dropout=dropout)
255
+ self.mid.attn_1 = AttnBlock(block_in)
256
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
257
+ out_channels=block_in,
258
+ temb_channels=self.temb_ch,
259
+ dropout=dropout)
260
+
261
+ # upsampling
262
+ self.up = nn.ModuleList()
263
+ for i_level in reversed(range(self.num_resolutions)):
264
+ block = nn.ModuleList()
265
+ attn = nn.ModuleList()
266
+ block_out = ch*ch_mult[i_level]
267
+ skip_in = ch*ch_mult[i_level]
268
+ for i_block in range(self.num_res_blocks+1):
269
+ if i_block == self.num_res_blocks:
270
+ skip_in = ch*in_ch_mult[i_level]
271
+ block.append(ResnetBlock(in_channels=block_in+skip_in,
272
+ out_channels=block_out,
273
+ temb_channels=self.temb_ch,
274
+ dropout=dropout))
275
+ block_in = block_out
276
+ if curr_res in attn_resolutions:
277
+ attn.append(AttnBlock(block_in))
278
+ up = nn.Module()
279
+ up.block = block
280
+ up.attn = attn
281
+ if i_level != 0:
282
+ up.upsample = Upsample(block_in, resamp_with_conv)
283
+ curr_res = curr_res * 2
284
+ self.up.insert(0, up) # prepend to get consistent order
285
+
286
+ # end
287
+ self.norm_out = Normalize(block_in)
288
+ self.conv_out = torch.nn.Conv2d(block_in,
289
+ out_ch,
290
+ kernel_size=3,
291
+ stride=1,
292
+ padding=1)
293
+
294
+
295
+ def forward(self, x, t=None):
296
+ #assert x.shape[2] == x.shape[3] == self.resolution
297
+
298
+ if self.use_timestep:
299
+ # timestep embedding
300
+ assert t is not None
301
+ temb = get_timestep_embedding(t, self.ch)
302
+ temb = self.temb.dense[0](temb)
303
+ temb = nonlinearity(temb)
304
+ temb = self.temb.dense[1](temb)
305
+ else:
306
+ temb = None
307
+
308
+ # downsampling
309
+ hs = [self.conv_in(x)]
310
+ for i_level in range(self.num_resolutions):
311
+ for i_block in range(self.num_res_blocks):
312
+ h = self.down[i_level].block[i_block](hs[-1], temb)
313
+ if len(self.down[i_level].attn) > 0:
314
+ h = self.down[i_level].attn[i_block](h)
315
+ hs.append(h)
316
+ if i_level != self.num_resolutions-1:
317
+ hs.append(self.down[i_level].downsample(hs[-1]))
318
+
319
+ # middle
320
+ h = hs[-1]
321
+ h = self.mid.block_1(h, temb)
322
+ h = self.mid.attn_1(h)
323
+ h = self.mid.block_2(h, temb)
324
+
325
+ # upsampling
326
+ for i_level in reversed(range(self.num_resolutions)):
327
+ for i_block in range(self.num_res_blocks+1):
328
+ h = self.up[i_level].block[i_block](
329
+ torch.cat([h, hs.pop()], dim=1), temb)
330
+ if len(self.up[i_level].attn) > 0:
331
+ h = self.up[i_level].attn[i_block](h)
332
+ if i_level != 0:
333
+ h = self.up[i_level].upsample(h)
334
+
335
+ # end
336
+ h = self.norm_out(h)
337
+ h = nonlinearity(h)
338
+ h = self.conv_out(h)
339
+ return h
340
+
341
+
342
+ class Encoder(nn.Module):
343
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
344
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
345
+ resolution, z_channels, double_z=True, **ignore_kwargs):
346
+ super().__init__()
347
+ self.ch = ch
348
+ self.temb_ch = 0
349
+ self.num_resolutions = len(ch_mult)
350
+ self.num_res_blocks = num_res_blocks
351
+ self.resolution = resolution
352
+ self.in_channels = in_channels
353
+
354
+ # downsampling
355
+ self.conv_in = torch.nn.Conv2d(in_channels,
356
+ self.ch,
357
+ kernel_size=3,
358
+ stride=1,
359
+ padding=1)
360
+
361
+ curr_res = resolution
362
+ in_ch_mult = (1,)+tuple(ch_mult)
363
+ self.down = nn.ModuleList()
364
+ for i_level in range(self.num_resolutions):
365
+ block = nn.ModuleList()
366
+ attn = nn.ModuleList()
367
+ block_in = ch*in_ch_mult[i_level]
368
+ block_out = ch*ch_mult[i_level]
369
+ for i_block in range(self.num_res_blocks):
370
+ block.append(ResnetBlock(in_channels=block_in,
371
+ out_channels=block_out,
372
+ temb_channels=self.temb_ch,
373
+ dropout=dropout))
374
+ block_in = block_out
375
+ if curr_res in attn_resolutions:
376
+ attn.append(AttnBlock(block_in))
377
+ down = nn.Module()
378
+ down.block = block
379
+ down.attn = attn
380
+ if i_level != self.num_resolutions-1:
381
+ down.downsample = Downsample(block_in, resamp_with_conv)
382
+ curr_res = curr_res // 2
383
+ self.down.append(down)
384
+
385
+ # middle
386
+ self.mid = nn.Module()
387
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
388
+ out_channels=block_in,
389
+ temb_channels=self.temb_ch,
390
+ dropout=dropout)
391
+ self.mid.attn_1 = AttnBlock(block_in)
392
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
393
+ out_channels=block_in,
394
+ temb_channels=self.temb_ch,
395
+ dropout=dropout)
396
+
397
+ # end
398
+ self.norm_out = Normalize(block_in)
399
+ self.conv_out = torch.nn.Conv2d(block_in,
400
+ 2*z_channels if double_z else z_channels,
401
+ kernel_size=3,
402
+ stride=1,
403
+ padding=1)
404
+
405
+
406
+ def forward(self, x):
407
+ #assert x.shape[2] == x.shape[3] == self.resolution, "{}, {}, {}".format(x.shape[2], x.shape[3], self.resolution)
408
+
409
+ # timestep embedding
410
+ temb = None
411
+
412
+ # downsampling
413
+ hs = [self.conv_in(x)]
414
+ for i_level in range(self.num_resolutions):
415
+ for i_block in range(self.num_res_blocks):
416
+ h = self.down[i_level].block[i_block](hs[-1], temb)
417
+ if len(self.down[i_level].attn) > 0:
418
+ h = self.down[i_level].attn[i_block](h)
419
+ hs.append(h)
420
+ if i_level != self.num_resolutions-1:
421
+ hs.append(self.down[i_level].downsample(hs[-1]))
422
+
423
+ # middle
424
+ h = hs[-1]
425
+ h = self.mid.block_1(h, temb)
426
+ h = self.mid.attn_1(h)
427
+ h = self.mid.block_2(h, temb)
428
+
429
+ # end
430
+ h = self.norm_out(h)
431
+ h = nonlinearity(h)
432
+ h = self.conv_out(h)
433
+ return h
434
+
435
+
436
+ class Decoder(nn.Module):
437
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
438
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
439
+ resolution, z_channels, give_pre_end=False, **ignorekwargs):
440
+ super().__init__()
441
+ self.ch = ch
442
+ self.temb_ch = 0
443
+ self.num_resolutions = len(ch_mult)
444
+ self.num_res_blocks = num_res_blocks
445
+ self.resolution = resolution
446
+ self.in_channels = in_channels
447
+ self.give_pre_end = give_pre_end
448
+
449
+ # compute in_ch_mult, block_in and curr_res at lowest res
450
+ in_ch_mult = (1,)+tuple(ch_mult)
451
+ block_in = ch*ch_mult[self.num_resolutions-1]
452
+ curr_res = resolution // 2**(self.num_resolutions-1)
453
+ self.z_shape = (1,z_channels,curr_res,curr_res)
454
+ print("Working with z of shape {} = {} dimensions.".format(
455
+ self.z_shape, np.prod(self.z_shape)))
456
+
457
+ # z to block_in
458
+ self.conv_in = torch.nn.Conv2d(z_channels,
459
+ block_in,
460
+ kernel_size=3,
461
+ stride=1,
462
+ padding=1)
463
+
464
+ # middle
465
+ self.mid = nn.Module()
466
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
467
+ out_channels=block_in,
468
+ temb_channels=self.temb_ch,
469
+ dropout=dropout)
470
+ self.mid.attn_1 = AttnBlock(block_in)
471
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
472
+ out_channels=block_in,
473
+ temb_channels=self.temb_ch,
474
+ dropout=dropout)
475
+
476
+ # upsampling
477
+ self.up = nn.ModuleList()
478
+ for i_level in reversed(range(self.num_resolutions)):
479
+ block = nn.ModuleList()
480
+ attn = nn.ModuleList()
481
+ block_out = ch*ch_mult[i_level]
482
+ for i_block in range(self.num_res_blocks+1):
483
+ block.append(ResnetBlock(in_channels=block_in,
484
+ out_channels=block_out,
485
+ temb_channels=self.temb_ch,
486
+ dropout=dropout))
487
+ block_in = block_out
488
+ if curr_res in attn_resolutions:
489
+ attn.append(AttnBlock(block_in))
490
+ up = nn.Module()
491
+ up.block = block
492
+ up.attn = attn
493
+ if i_level != 0:
494
+ up.upsample = Upsample(block_in, resamp_with_conv)
495
+ curr_res = curr_res * 2
496
+ self.up.insert(0, up) # prepend to get consistent order
497
+
498
+ # end
499
+ self.norm_out = Normalize(block_in)
500
+ self.conv_out = torch.nn.Conv2d(block_in,
501
+ out_ch,
502
+ kernel_size=3,
503
+ stride=1,
504
+ padding=1)
505
+
506
+ def forward(self, z):
507
+ #assert z.shape[1:] == self.z_shape[1:]
508
+ self.last_z_shape = z.shape
509
+
510
+ # timestep embedding
511
+ temb = None
512
+
513
+ # z to block_in
514
+ h = self.conv_in(z)
515
+
516
+ # middle
517
+ h = self.mid.block_1(h, temb)
518
+ h = self.mid.attn_1(h)
519
+ h = self.mid.block_2(h, temb)
520
+
521
+ # upsampling
522
+ for i_level in reversed(range(self.num_resolutions)):
523
+ for i_block in range(self.num_res_blocks+1):
524
+ h = self.up[i_level].block[i_block](h, temb)
525
+ if len(self.up[i_level].attn) > 0:
526
+ h = self.up[i_level].attn[i_block](h)
527
+ if i_level != 0:
528
+ h = self.up[i_level].upsample(h)
529
+
530
+ # end
531
+ if self.give_pre_end:
532
+ return h
533
+
534
+ h = self.norm_out(h)
535
+ h = nonlinearity(h)
536
+ h = self.conv_out(h)
537
+ return h
538
+
539
+
540
+ class VUNet(nn.Module):
541
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
542
+ attn_resolutions, dropout=0.0, resamp_with_conv=True,
543
+ in_channels, c_channels,
544
+ resolution, z_channels, use_timestep=False, **ignore_kwargs):
545
+ super().__init__()
546
+ self.ch = ch
547
+ self.temb_ch = self.ch*4
548
+ self.num_resolutions = len(ch_mult)
549
+ self.num_res_blocks = num_res_blocks
550
+ self.resolution = resolution
551
+
552
+ self.use_timestep = use_timestep
553
+ if self.use_timestep:
554
+ # timestep embedding
555
+ self.temb = nn.Module()
556
+ self.temb.dense = nn.ModuleList([
557
+ torch.nn.Linear(self.ch,
558
+ self.temb_ch),
559
+ torch.nn.Linear(self.temb_ch,
560
+ self.temb_ch),
561
+ ])
562
+
563
+ # downsampling
564
+ self.conv_in = torch.nn.Conv2d(c_channels,
565
+ self.ch,
566
+ kernel_size=3,
567
+ stride=1,
568
+ padding=1)
569
+
570
+ curr_res = resolution
571
+ in_ch_mult = (1,)+tuple(ch_mult)
572
+ self.down = nn.ModuleList()
573
+ for i_level in range(self.num_resolutions):
574
+ block = nn.ModuleList()
575
+ attn = nn.ModuleList()
576
+ block_in = ch*in_ch_mult[i_level]
577
+ block_out = ch*ch_mult[i_level]
578
+ for i_block in range(self.num_res_blocks):
579
+ block.append(ResnetBlock(in_channels=block_in,
580
+ out_channels=block_out,
581
+ temb_channels=self.temb_ch,
582
+ dropout=dropout))
583
+ block_in = block_out
584
+ if curr_res in attn_resolutions:
585
+ attn.append(AttnBlock(block_in))
586
+ down = nn.Module()
587
+ down.block = block
588
+ down.attn = attn
589
+ if i_level != self.num_resolutions-1:
590
+ down.downsample = Downsample(block_in, resamp_with_conv)
591
+ curr_res = curr_res // 2
592
+ self.down.append(down)
593
+
594
+ self.z_in = torch.nn.Conv2d(z_channels,
595
+ block_in,
596
+ kernel_size=1,
597
+ stride=1,
598
+ padding=0)
599
+ # middle
600
+ self.mid = nn.Module()
601
+ self.mid.block_1 = ResnetBlock(in_channels=2*block_in,
602
+ out_channels=block_in,
603
+ temb_channels=self.temb_ch,
604
+ dropout=dropout)
605
+ self.mid.attn_1 = AttnBlock(block_in)
606
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
607
+ out_channels=block_in,
608
+ temb_channels=self.temb_ch,
609
+ dropout=dropout)
610
+
611
+ # upsampling
612
+ self.up = nn.ModuleList()
613
+ for i_level in reversed(range(self.num_resolutions)):
614
+ block = nn.ModuleList()
615
+ attn = nn.ModuleList()
616
+ block_out = ch*ch_mult[i_level]
617
+ skip_in = ch*ch_mult[i_level]
618
+ for i_block in range(self.num_res_blocks+1):
619
+ if i_block == self.num_res_blocks:
620
+ skip_in = ch*in_ch_mult[i_level]
621
+ block.append(ResnetBlock(in_channels=block_in+skip_in,
622
+ out_channels=block_out,
623
+ temb_channels=self.temb_ch,
624
+ dropout=dropout))
625
+ block_in = block_out
626
+ if curr_res in attn_resolutions:
627
+ attn.append(AttnBlock(block_in))
628
+ up = nn.Module()
629
+ up.block = block
630
+ up.attn = attn
631
+ if i_level != 0:
632
+ up.upsample = Upsample(block_in, resamp_with_conv)
633
+ curr_res = curr_res * 2
634
+ self.up.insert(0, up) # prepend to get consistent order
635
+
636
+ # end
637
+ self.norm_out = Normalize(block_in)
638
+ self.conv_out = torch.nn.Conv2d(block_in,
639
+ out_ch,
640
+ kernel_size=3,
641
+ stride=1,
642
+ padding=1)
643
+
644
+
645
+ def forward(self, x, z, t=None):
646
+ #assert x.shape[2] == x.shape[3] == self.resolution
647
+
648
+ if self.use_timestep:
649
+ # timestep embedding
650
+ assert t is not None
651
+ temb = get_timestep_embedding(t, self.ch)
652
+ temb = self.temb.dense[0](temb)
653
+ temb = nonlinearity(temb)
654
+ temb = self.temb.dense[1](temb)
655
+ else:
656
+ temb = None
657
+
658
+ # downsampling
659
+ hs = [self.conv_in(x)]
660
+ for i_level in range(self.num_resolutions):
661
+ for i_block in range(self.num_res_blocks):
662
+ h = self.down[i_level].block[i_block](hs[-1], temb)
663
+ if len(self.down[i_level].attn) > 0:
664
+ h = self.down[i_level].attn[i_block](h)
665
+ hs.append(h)
666
+ if i_level != self.num_resolutions-1:
667
+ hs.append(self.down[i_level].downsample(hs[-1]))
668
+
669
+ # middle
670
+ h = hs[-1]
671
+ z = self.z_in(z)
672
+ h = torch.cat((h,z),dim=1)
673
+ h = self.mid.block_1(h, temb)
674
+ h = self.mid.attn_1(h)
675
+ h = self.mid.block_2(h, temb)
676
+
677
+ # upsampling
678
+ for i_level in reversed(range(self.num_resolutions)):
679
+ for i_block in range(self.num_res_blocks+1):
680
+ h = self.up[i_level].block[i_block](
681
+ torch.cat([h, hs.pop()], dim=1), temb)
682
+ if len(self.up[i_level].attn) > 0:
683
+ h = self.up[i_level].attn[i_block](h)
684
+ if i_level != 0:
685
+ h = self.up[i_level].upsample(h)
686
+
687
+ # end
688
+ h = self.norm_out(h)
689
+ h = nonlinearity(h)
690
+ h = self.conv_out(h)
691
+ return h
692
+
693
+
694
+ class SimpleDecoder(nn.Module):
695
+ def __init__(self, in_channels, out_channels, *args, **kwargs):
696
+ super().__init__()
697
+ self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
698
+ ResnetBlock(in_channels=in_channels,
699
+ out_channels=2 * in_channels,
700
+ temb_channels=0, dropout=0.0),
701
+ ResnetBlock(in_channels=2 * in_channels,
702
+ out_channels=4 * in_channels,
703
+ temb_channels=0, dropout=0.0),
704
+ ResnetBlock(in_channels=4 * in_channels,
705
+ out_channels=2 * in_channels,
706
+ temb_channels=0, dropout=0.0),
707
+ nn.Conv2d(2*in_channels, in_channels, 1),
708
+ Upsample(in_channels, with_conv=True)])
709
+ # end
710
+ self.norm_out = Normalize(in_channels)
711
+ self.conv_out = torch.nn.Conv2d(in_channels,
712
+ out_channels,
713
+ kernel_size=3,
714
+ stride=1,
715
+ padding=1)
716
+
717
+ def forward(self, x):
718
+ for i, layer in enumerate(self.model):
719
+ if i in [1,2,3]:
720
+ x = layer(x, None)
721
+ else:
722
+ x = layer(x)
723
+
724
+ h = self.norm_out(x)
725
+ h = nonlinearity(h)
726
+ x = self.conv_out(h)
727
+ return x
728
+
729
+
730
+ class UpsampleDecoder(nn.Module):
731
+ def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
732
+ ch_mult=(2,2), dropout=0.0):
733
+ super().__init__()
734
+ # upsampling
735
+ self.temb_ch = 0
736
+ self.num_resolutions = len(ch_mult)
737
+ self.num_res_blocks = num_res_blocks
738
+ block_in = in_channels
739
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
740
+ self.res_blocks = nn.ModuleList()
741
+ self.upsample_blocks = nn.ModuleList()
742
+ for i_level in range(self.num_resolutions):
743
+ res_block = []
744
+ block_out = ch * ch_mult[i_level]
745
+ for i_block in range(self.num_res_blocks + 1):
746
+ res_block.append(ResnetBlock(in_channels=block_in,
747
+ out_channels=block_out,
748
+ temb_channels=self.temb_ch,
749
+ dropout=dropout))
750
+ block_in = block_out
751
+ self.res_blocks.append(nn.ModuleList(res_block))
752
+ if i_level != self.num_resolutions - 1:
753
+ self.upsample_blocks.append(Upsample(block_in, True))
754
+ curr_res = curr_res * 2
755
+
756
+ # end
757
+ self.norm_out = Normalize(block_in)
758
+ self.conv_out = torch.nn.Conv2d(block_in,
759
+ out_channels,
760
+ kernel_size=3,
761
+ stride=1,
762
+ padding=1)
763
+
764
+ def forward(self, x):
765
+ # upsampling
766
+ h = x
767
+ for k, i_level in enumerate(range(self.num_resolutions)):
768
+ for i_block in range(self.num_res_blocks + 1):
769
+ h = self.res_blocks[i_level][i_block](h, None)
770
+ if i_level != self.num_resolutions - 1:
771
+ h = self.upsample_blocks[k](h)
772
+ h = self.norm_out(h)
773
+ h = nonlinearity(h)
774
+ h = self.conv_out(h)
775
+ return h
776
+
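The Encoder/Decoder pair added above follows the usual VQGAN convolutional autoencoder layout. As a rough orientation, a minimal sketch of wiring them together is shown below; it assumes the classes live at taming/modules/diffusionmodules/model.py as in upstream taming-transformers, and the channel/resolution values are illustrative rather than taken from any config in this commit.

import torch
from taming.modules.diffusionmodules.model import Encoder, Decoder

# Illustrative hyperparameters (not from a shipped config).
enc = Encoder(ch=64, out_ch=1, ch_mult=(1, 2, 4), num_res_blocks=2,
              attn_resolutions=[16], in_channels=1, resolution=128,
              z_channels=64, double_z=False)
dec = Decoder(ch=64, out_ch=1, ch_mult=(1, 2, 4), num_res_blocks=2,
              attn_resolutions=[16], in_channels=1, resolution=128,
              z_channels=64)

x = torch.randn(1, 1, 128, 128)   # e.g. a single-channel image crop
z = enc(x)                        # latent of shape (1, 64, 32, 32)
x_rec = dec(z)                    # reconstruction of shape (1, 1, 128, 128)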
taming/modules/discriminator/model.py ADDED
@@ -0,0 +1,67 @@
1
+ import functools
2
+ import torch.nn as nn
3
+
4
+
5
+ from taming.modules.util import ActNorm
6
+
7
+
8
+ def weights_init(m):
9
+ classname = m.__class__.__name__
10
+ if classname.find('Conv') != -1:
11
+ nn.init.normal_(m.weight.data, 0.0, 0.02)
12
+ elif classname.find('BatchNorm') != -1:
13
+ nn.init.normal_(m.weight.data, 1.0, 0.02)
14
+ nn.init.constant_(m.bias.data, 0)
15
+
16
+
17
+ class NLayerDiscriminator(nn.Module):
18
+ """Defines a PatchGAN discriminator as in Pix2Pix
19
+ --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
20
+ """
21
+ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
22
+ """Construct a PatchGAN discriminator
23
+ Parameters:
24
+ input_nc (int) -- the number of channels in input images
25
+ ndf (int) -- the number of filters in the last conv layer
26
+ n_layers (int) -- the number of conv layers in the discriminator
27
+ norm_layer -- normalization layer
28
+ """
29
+ super(NLayerDiscriminator, self).__init__()
30
+ if not use_actnorm:
31
+ norm_layer = nn.BatchNorm2d
32
+ else:
33
+ norm_layer = ActNorm
34
+ if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
35
+ use_bias = norm_layer.func != nn.BatchNorm2d
36
+ else:
37
+ use_bias = norm_layer != nn.BatchNorm2d
38
+
39
+ kw = 4
40
+ padw = 1
41
+ sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
42
+ nf_mult = 1
43
+ nf_mult_prev = 1
44
+ for n in range(1, n_layers): # gradually increase the number of filters
45
+ nf_mult_prev = nf_mult
46
+ nf_mult = min(2 ** n, 8)
47
+ sequence += [
48
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
49
+ norm_layer(ndf * nf_mult),
50
+ nn.LeakyReLU(0.2, True)
51
+ ]
52
+
53
+ nf_mult_prev = nf_mult
54
+ nf_mult = min(2 ** n_layers, 8)
55
+ sequence += [
56
+ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
57
+ norm_layer(ndf * nf_mult),
58
+ nn.LeakyReLU(0.2, True)
59
+ ]
60
+
61
+ sequence += [
62
+ nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
63
+ self.main = nn.Sequential(*sequence)
64
+
65
+ def forward(self, input):
66
+ """Standard forward."""
67
+ return self.main(input)
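A minimal usage sketch of the PatchGAN discriminator defined above, assuming the taming package from this commit is on the import path; the input size is arbitrary.

import torch
from taming.modules.discriminator.model import NLayerDiscriminator, weights_init

# 3-layer PatchGAN over RGB input; set use_actnorm=True to swap BatchNorm for ActNorm.
disc = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=3, use_actnorm=False).apply(weights_init)

images = torch.randn(2, 3, 256, 256)
logits = disc(images)   # patch-wise real/fake logits, here of shape (2, 1, 30, 30)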
taming/modules/losses/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from taming.modules.losses.vqperceptual import DummyLoss
2
+
taming/modules/losses/lpips.py ADDED
@@ -0,0 +1,123 @@
1
+ """Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torchvision import models
6
+ from collections import namedtuple
7
+
8
+ from taming.util import get_ckpt_path
9
+
10
+
11
+ class LPIPS(nn.Module):
12
+ # Learned perceptual metric
13
+ def __init__(self, use_dropout=True):
14
+ super().__init__()
15
+ self.scaling_layer = ScalingLayer()
16
+ self.chns = [64, 128, 256, 512, 512] # vgg16 features
17
+ self.net = vgg16(pretrained=True, requires_grad=False)
18
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
19
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
20
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
21
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
22
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
23
+ self.load_from_pretrained()
24
+ for param in self.parameters():
25
+ param.requires_grad = False
26
+
27
+ def load_from_pretrained(self, name="vgg_lpips"):
28
+ ckpt = get_ckpt_path(name, "taming/modules/autoencoder/lpips")
29
+ self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
30
+ print("loaded pretrained LPIPS loss from {}".format(ckpt))
31
+
32
+ @classmethod
33
+ def from_pretrained(cls, name="vgg_lpips"):
34
+ if name != "vgg_lpips":
35
+ raise NotImplementedError
36
+ model = cls()
37
+ ckpt = get_ckpt_path(name)
38
+ model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
39
+ return model
40
+
41
+ def forward(self, input, target):
42
+ in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
43
+ outs0, outs1 = self.net(in0_input), self.net(in1_input)
44
+ feats0, feats1, diffs = {}, {}, {}
45
+ lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
46
+ for kk in range(len(self.chns)):
47
+ feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
48
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
49
+
50
+ res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
51
+ val = res[0]
52
+ for l in range(1, len(self.chns)):
53
+ val += res[l]
54
+ return val
55
+
56
+
57
+ class ScalingLayer(nn.Module):
58
+ def __init__(self):
59
+ super(ScalingLayer, self).__init__()
60
+ self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
61
+ self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])
62
+
63
+ def forward(self, inp):
64
+ return (inp - self.shift) / self.scale
65
+
66
+
67
+ class NetLinLayer(nn.Module):
68
+ """ A single linear layer which does a 1x1 conv """
69
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
70
+ super(NetLinLayer, self).__init__()
71
+ layers = [nn.Dropout(), ] if (use_dropout) else []
72
+ layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
73
+ self.model = nn.Sequential(*layers)
74
+
75
+
76
+ class vgg16(torch.nn.Module):
77
+ def __init__(self, requires_grad=False, pretrained=True):
78
+ super(vgg16, self).__init__()
79
+ vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
80
+ self.slice1 = torch.nn.Sequential()
81
+ self.slice2 = torch.nn.Sequential()
82
+ self.slice3 = torch.nn.Sequential()
83
+ self.slice4 = torch.nn.Sequential()
84
+ self.slice5 = torch.nn.Sequential()
85
+ self.N_slices = 5
86
+ for x in range(4):
87
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
88
+ for x in range(4, 9):
89
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
90
+ for x in range(9, 16):
91
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
92
+ for x in range(16, 23):
93
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
94
+ for x in range(23, 30):
95
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
96
+ if not requires_grad:
97
+ for param in self.parameters():
98
+ param.requires_grad = False
99
+
100
+ def forward(self, X):
101
+ h = self.slice1(X)
102
+ h_relu1_2 = h
103
+ h = self.slice2(h)
104
+ h_relu2_2 = h
105
+ h = self.slice3(h)
106
+ h_relu3_3 = h
107
+ h = self.slice4(h)
108
+ h_relu4_3 = h
109
+ h = self.slice5(h)
110
+ h_relu5_3 = h
111
+ vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
112
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
113
+ return out
114
+
115
+
116
+ def normalize_tensor(x,eps=1e-10):
117
+ norm_factor = torch.sqrt(torch.sum(x**2,dim=1,keepdim=True))
118
+ return x/(norm_factor+eps)
119
+
120
+
121
+ def spatial_average(x, keepdim=True):
122
+ return x.mean([2,3],keepdim=keepdim)
123
+
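A minimal sketch of calling the LPIPS metric above, assuming the taming package is importable. Note that constructing it downloads both the torchvision VGG16 weights and the vgg_lpips checkpoint via get_ckpt_path, so the first run needs network access (or cached files).

import torch
from taming.modules.losses.lpips import LPIPS

perceptual = LPIPS().eval()

img_a = torch.rand(1, 3, 224, 224) * 2 - 1   # inputs roughly in [-1, 1]
img_b = torch.rand(1, 3, 224, 224) * 2 - 1
dist = perceptual(img_a, img_b)              # perceptual distance, shape (1, 1, 1, 1)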
taming/modules/losses/segmentation.py ADDED
@@ -0,0 +1,22 @@
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+
4
+
5
+ class BCELoss(nn.Module):
6
+ def forward(self, prediction, target):
7
+ loss = F.binary_cross_entropy_with_logits(prediction,target)
8
+ return loss, {}
9
+
10
+
11
+ class BCELossWithQuant(nn.Module):
12
+ def __init__(self, codebook_weight=1.):
13
+ super().__init__()
14
+ self.codebook_weight = codebook_weight
15
+
16
+ def forward(self, qloss, target, prediction, split):
17
+ bce_loss = F.binary_cross_entropy_with_logits(prediction,target)
18
+ loss = bce_loss + self.codebook_weight*qloss
19
+ return loss, {"{}/total_loss".format(split): loss.clone().detach().mean(),
20
+ "{}/bce_loss".format(split): bce_loss.detach().mean(),
21
+ "{}/quant_loss".format(split): qloss.detach().mean()
22
+ }
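A small sketch of how BCELossWithQuant above could be called, assuming the taming package is importable; the qloss value is a stand-in for a codebook/commitment loss produced elsewhere.

import torch
from taming.modules.losses.segmentation import BCELossWithQuant

criterion = BCELossWithQuant(codebook_weight=1.0)
logits = torch.randn(2, 1, 64, 64)                      # raw segmentation logits
target = torch.randint(0, 2, (2, 1, 64, 64)).float()    # binary mask
qloss = torch.tensor(0.1)                               # stand-in quantization loss
loss, log = criterion(qloss, target, logits, split="train")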
taming/modules/losses/vqperceptual.py ADDED
@@ -0,0 +1,182 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from taming.modules.losses.lpips import LPIPS
6
+ from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
7
+
8
+
9
+ class DummyLoss(nn.Module):
10
+ def __init__(self):
11
+ super().__init__()
12
+
13
+
14
+ def adopt_weight(weight, global_step, threshold=0, value=0.0):
15
+ if global_step < threshold:
16
+ weight = value
17
+ return weight
18
+
19
+
20
+ def hinge_d_loss(logits_real, logits_fake):
21
+ loss_real = torch.mean(F.relu(1.0 - logits_real))
22
+ loss_fake = torch.mean(F.relu(1.0 + logits_fake))
23
+ d_loss = 0.5 * (loss_real + loss_fake)
24
+ return d_loss
25
+
26
+
27
+ def vanilla_d_loss(logits_real, logits_fake):
28
+ d_loss = 0.5 * (
29
+ torch.mean(torch.nn.functional.softplus(-logits_real))
30
+ + torch.mean(torch.nn.functional.softplus(logits_fake))
31
+ )
32
+ return d_loss
33
+
34
+
35
+ class VQLPIPSWithDiscriminator(nn.Module):
36
+ def __init__(
37
+ self,
38
+ disc_start,
39
+ codebook_weight=1.0,
40
+ pixelloss_weight=1.0,
41
+ disc_num_layers=3,
42
+ disc_in_channels=3,
43
+ disc_factor=1.0,
44
+ disc_weight=1.0,
45
+ perceptual_weight=1.0,
46
+ use_actnorm=False,
47
+ disc_conditional=False,
48
+ disc_ndf=64,
49
+ disc_loss="hinge",
50
+ ):
51
+ super().__init__()
52
+ assert disc_loss in ["hinge", "vanilla"]
53
+ self.codebook_weight = codebook_weight
54
+ self.pixel_weight = pixelloss_weight
55
+ self.perceptual_loss = LPIPS().eval()
56
+ self.perceptual_weight = perceptual_weight
57
+
58
+ self.discriminator = NLayerDiscriminator(
59
+ input_nc=disc_in_channels,
60
+ n_layers=disc_num_layers,
61
+ use_actnorm=use_actnorm,
62
+ ndf=disc_ndf,
63
+ ).apply(weights_init)
64
+ self.discriminator_iter_start = disc_start
65
+ if disc_loss == "hinge":
66
+ self.disc_loss = hinge_d_loss
67
+ elif disc_loss == "vanilla":
68
+ self.disc_loss = vanilla_d_loss
69
+ else:
70
+ raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
71
+ print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
72
+ self.disc_factor = disc_factor
73
+ self.discriminator_weight = disc_weight
74
+ self.disc_conditional = disc_conditional
75
+
76
+ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
77
+ if last_layer is not None:
78
+ nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
79
+ g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
80
+ else:
81
+ nll_grads = torch.autograd.grad(
82
+ nll_loss, self.last_layer[0], retain_graph=True
83
+ )[0]
84
+ g_grads = torch.autograd.grad(
85
+ g_loss, self.last_layer[0], retain_graph=True
86
+ )[0]
87
+
88
+ d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
89
+ d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
90
+ d_weight = d_weight * self.discriminator_weight
91
+ return d_weight
92
+
93
+ def forward(
94
+ self,
95
+ codebook_loss,
96
+ inputs,
97
+ reconstructions,
98
+ optimizer_idx,
99
+ global_step,
100
+ last_layer=None,
101
+ cond=None,
102
+ split="train",
103
+ ):
104
+ rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
105
+ if self.perceptual_weight > 0:
106
+ p_loss = self.perceptual_loss(
107
+ inputs.contiguous(), reconstructions.contiguous()
108
+ )
109
+ rec_loss = rec_loss + self.perceptual_weight * p_loss
110
+ else:
111
+ p_loss = torch.tensor([0.0])
112
+
113
+ nll_loss = rec_loss
114
+ # nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
115
+ nll_loss = torch.mean(nll_loss)
116
+
117
+ # now the GAN part
118
+ if optimizer_idx == 0:
119
+ # generator update
120
+ if cond is None:
121
+ assert not self.disc_conditional
122
+ logits_fake = self.discriminator(reconstructions.contiguous())
123
+ else:
124
+ assert self.disc_conditional
125
+ logits_fake = self.discriminator(
126
+ torch.cat((reconstructions.contiguous(), cond), dim=1)
127
+ )
128
+ g_loss = -torch.mean(logits_fake)
129
+
130
+ try:
131
+ d_weight = self.calculate_adaptive_weight(
132
+ nll_loss, g_loss, last_layer=last_layer
133
+ )
134
+ except RuntimeError:
135
+ assert not self.training
136
+ d_weight = torch.tensor(0.0)
137
+
138
+ disc_factor = adopt_weight(
139
+ self.disc_factor, global_step, threshold=self.discriminator_iter_start
140
+ )
141
+ loss = (
142
+ nll_loss
143
+ + d_weight * disc_factor * g_loss
144
+ + self.codebook_weight * codebook_loss.mean()
145
+ )
146
+
147
+ log = {
148
+ "{}/total_loss".format(split): loss.clone().detach().mean(),
149
+ "{}/quant_loss".format(split): codebook_loss.detach().mean(),
150
+ "{}/nll_loss".format(split): nll_loss.detach().mean(),
151
+ "{}/rec_loss".format(split): rec_loss.detach().mean(),
152
+ "{}/p_loss".format(split): p_loss.detach().mean(),
153
+ "{}/d_weight".format(split): d_weight.detach(),
154
+ "{}/disc_factor".format(split): torch.tensor(disc_factor),
155
+ "{}/g_loss".format(split): g_loss.detach().mean(),
156
+ }
157
+ return loss, log
158
+
159
+ if optimizer_idx == 1:
160
+ # second pass for discriminator update
161
+ if cond is None:
162
+ logits_real = self.discriminator(inputs.contiguous().detach())
163
+ logits_fake = self.discriminator(reconstructions.contiguous().detach())
164
+ else:
165
+ logits_real = self.discriminator(
166
+ torch.cat((inputs.contiguous().detach(), cond), dim=1)
167
+ )
168
+ logits_fake = self.discriminator(
169
+ torch.cat((reconstructions.contiguous().detach(), cond), dim=1)
170
+ )
171
+
172
+ disc_factor = adopt_weight(
173
+ self.disc_factor, global_step, threshold=self.discriminator_iter_start
174
+ )
175
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
176
+
177
+ log = {
178
+ "{}/disc_loss".format(split): d_loss.clone().detach().mean(),
179
+ "{}/logits_real".format(split): logits_real.detach().mean(),
180
+ "{}/logits_fake".format(split): logits_fake.detach().mean(),
181
+ }
182
+ return d_loss, log
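The helper functions above are small and pure, so they are easy to sanity-check in isolation. A minimal sketch, assuming the taming package is importable: adopt_weight keeps the GAN term at zero until disc_start steps have passed, and hinge_d_loss is the default discriminator objective.

import torch
from taming.modules.losses.vqperceptual import hinge_d_loss, adopt_weight

logits_real = torch.randn(4, 1, 30, 30) + 1.0
logits_fake = torch.randn(4, 1, 30, 30) - 1.0
d_loss = hinge_d_loss(logits_real, logits_fake)   # 0.5 * (mean relu(1-real) + mean relu(1+fake))

factor_early = adopt_weight(1.0, global_step=100, threshold=250)   # 0.0: discriminator not yet active
factor_late = adopt_weight(1.0, global_step=500, threshold=250)    # 1.0: discriminator term enabled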
taming/modules/misc/coord.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+
3
+ class CoordStage(object):
4
+ def __init__(self, n_embed, down_factor):
5
+ self.n_embed = n_embed
6
+ self.down_factor = down_factor
7
+
8
+ def eval(self):
9
+ return self
10
+
11
+ def encode(self, c):
12
+ """fake vqmodel interface"""
13
+ assert 0.0 <= c.min() and c.max() <= 1.0
14
+ b,ch,h,w = c.shape
15
+ assert ch == 1
16
+
17
+ c = torch.nn.functional.interpolate(c, scale_factor=1/self.down_factor,
18
+ mode="area")
19
+ c = c.clamp(0.0, 1.0)
20
+ c = self.n_embed*c
21
+ c_quant = c.round()
22
+ c_ind = c_quant.to(dtype=torch.long)
23
+
24
+ info = None, None, c_ind
25
+ return c_quant, None, info
26
+
27
+ def decode(self, c):
28
+ c = c/self.n_embed
29
+ c = torch.nn.functional.interpolate(c, scale_factor=self.down_factor,
30
+ mode="nearest")
31
+ return c
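A quick sketch of the CoordStage round trip above (quantize a [0, 1] coordinate map to indices, then upsample it back), assuming the taming package is importable; the sizes are arbitrary.

import torch
from taming.modules.misc.coord import CoordStage

stage = CoordStage(n_embed=1024, down_factor=16)
coords = torch.rand(1, 1, 256, 256)                 # coordinate map scaled to [0, 1]
c_quant, _, (_, _, c_ind) = stage.encode(coords)    # (1, 1, 16, 16) quantized map plus long indices
coarse = stage.decode(c_quant)                      # nearest-neighbour upsample back to 256x256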
taming/modules/transformer/mingpt.py ADDED
@@ -0,0 +1,415 @@
1
+ """
2
+ taken from: https://github.com/karpathy/minGPT/
3
+ GPT model:
4
+ - the initial stem consists of a combination of token encoding and a positional encoding
5
+ - the meat of it is a uniform sequence of Transformer blocks
6
+ - each Transformer is a sequential combination of a 1-hidden-layer MLP block and a self-attention block
7
+ - all blocks feed into a central residual pathway similar to resnets
8
+ - the final decoder is a linear projection into a vanilla Softmax classifier
9
+ """
10
+
11
+ import math
12
+ import logging
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+ from transformers import top_k_top_p_filtering
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class GPTConfig:
23
+ """ base GPT config, params common to all GPT versions """
24
+ embd_pdrop = 0.1
25
+ resid_pdrop = 0.1
26
+ attn_pdrop = 0.1
27
+
28
+ def __init__(self, vocab_size, block_size, **kwargs):
29
+ self.vocab_size = vocab_size
30
+ self.block_size = block_size
31
+ for k,v in kwargs.items():
32
+ setattr(self, k, v)
33
+
34
+
35
+ class GPT1Config(GPTConfig):
36
+ """ GPT-1 like network roughly 125M params """
37
+ n_layer = 12
38
+ n_head = 12
39
+ n_embd = 768
40
+
41
+
42
+ class CausalSelfAttention(nn.Module):
43
+ """
44
+ A vanilla multi-head masked self-attention layer with a projection at the end.
45
+ It is possible to use torch.nn.MultiheadAttention here but I am including an
46
+ explicit implementation here to show that there is nothing too scary here.
47
+ """
48
+
49
+ def __init__(self, config):
50
+ super().__init__()
51
+ assert config.n_embd % config.n_head == 0
52
+ # key, query, value projections for all heads
53
+ self.key = nn.Linear(config.n_embd, config.n_embd)
54
+ self.query = nn.Linear(config.n_embd, config.n_embd)
55
+ self.value = nn.Linear(config.n_embd, config.n_embd)
56
+ # regularization
57
+ self.attn_drop = nn.Dropout(config.attn_pdrop)
58
+ self.resid_drop = nn.Dropout(config.resid_pdrop)
59
+ # output projection
60
+ self.proj = nn.Linear(config.n_embd, config.n_embd)
61
+ # causal mask to ensure that attention is only applied to the left in the input sequence
62
+ mask = torch.tril(torch.ones(config.block_size,
63
+ config.block_size))
64
+ if hasattr(config, "n_unmasked"):
65
+ mask[:config.n_unmasked, :config.n_unmasked] = 1
66
+ self.register_buffer("mask", mask.view(1, 1, config.block_size, config.block_size))
67
+ self.n_head = config.n_head
68
+
69
+ def forward(self, x, layer_past=None):
70
+ B, T, C = x.size()
71
+
72
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
73
+ k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
74
+ q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
75
+ v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
76
+
77
+ present = torch.stack((k, v))
78
+ if layer_past is not None:
79
+ past_key, past_value = layer_past
80
+ k = torch.cat((past_key, k), dim=-2)
81
+ v = torch.cat((past_value, v), dim=-2)
82
+
83
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
84
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
85
+ if layer_past is None:
86
+ att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
87
+
88
+ att = F.softmax(att, dim=-1)
89
+ att = self.attn_drop(att)
90
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
91
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
92
+
93
+ # output projection
94
+ y = self.resid_drop(self.proj(y))
95
+ return y, present # TODO: check that this does not break anything
96
+
97
+
98
+ class Block(nn.Module):
99
+ """ an unassuming Transformer block """
100
+ def __init__(self, config):
101
+ super().__init__()
102
+ self.ln1 = nn.LayerNorm(config.n_embd)
103
+ self.ln2 = nn.LayerNorm(config.n_embd)
104
+ self.attn = CausalSelfAttention(config)
105
+ self.mlp = nn.Sequential(
106
+ nn.Linear(config.n_embd, 4 * config.n_embd),
107
+ nn.GELU(), # nice
108
+ nn.Linear(4 * config.n_embd, config.n_embd),
109
+ nn.Dropout(config.resid_pdrop),
110
+ )
111
+
112
+ def forward(self, x, layer_past=None, return_present=False):
113
+ # TODO: check that training still works
114
+ if return_present: assert not self.training
115
+ # layer past: tuple of length two with B, nh, T, hs
116
+ attn, present = self.attn(self.ln1(x), layer_past=layer_past)
117
+
118
+ x = x + attn
119
+ x = x + self.mlp(self.ln2(x))
120
+ if layer_past is not None or return_present:
121
+ return x, present
122
+ return x
123
+
124
+
125
+ class GPT(nn.Module):
126
+ """ the full GPT language model, with a context size of block_size """
127
+ def __init__(self, vocab_size, block_size, n_layer=12, n_head=8, n_embd=256,
128
+ embd_pdrop=0., resid_pdrop=0., attn_pdrop=0., n_unmasked=0):
129
+ super().__init__()
130
+ config = GPTConfig(vocab_size=vocab_size, block_size=block_size,
131
+ embd_pdrop=embd_pdrop, resid_pdrop=resid_pdrop, attn_pdrop=attn_pdrop,
132
+ n_layer=n_layer, n_head=n_head, n_embd=n_embd,
133
+ n_unmasked=n_unmasked)
134
+ # input embedding stem
135
+ self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
136
+ self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
137
+ self.drop = nn.Dropout(config.embd_pdrop)
138
+ # transformer
139
+ self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
140
+ # decoder head
141
+ self.ln_f = nn.LayerNorm(config.n_embd)
142
+ self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
143
+ self.block_size = config.block_size
144
+ self.apply(self._init_weights)
145
+ self.config = config
146
+ logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
147
+
148
+ def get_block_size(self):
149
+ return self.block_size
150
+
151
+ def _init_weights(self, module):
152
+ if isinstance(module, (nn.Linear, nn.Embedding)):
153
+ module.weight.data.normal_(mean=0.0, std=0.02)
154
+ if isinstance(module, nn.Linear) and module.bias is not None:
155
+ module.bias.data.zero_()
156
+ elif isinstance(module, nn.LayerNorm):
157
+ module.bias.data.zero_()
158
+ module.weight.data.fill_(1.0)
159
+
160
+ def forward(self, idx, embeddings=None, targets=None):
161
+ # forward the GPT model
162
+ token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
163
+
164
+ if embeddings is not None: # prepend explicit embeddings
165
+ token_embeddings = torch.cat((embeddings, token_embeddings), dim=1)
166
+
167
+ t = token_embeddings.shape[1]
168
+ assert t <= self.block_size, "Cannot forward, model block size is exhausted."
169
+ position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
170
+ x = self.drop(token_embeddings + position_embeddings)
171
+ x = self.blocks(x)
172
+ x = self.ln_f(x)
173
+ logits = self.head(x)
174
+
175
+ # if we are given some desired targets also calculate the loss
176
+ loss = None
177
+ if targets is not None:
178
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
179
+
180
+ return logits, loss
181
+
182
+ def forward_with_past(self, idx, embeddings=None, targets=None, past=None, past_length=None):
183
+ # inference only
184
+ assert not self.training
185
+ token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
186
+ if embeddings is not None: # prepend explicit embeddings
187
+ token_embeddings = torch.cat((embeddings, token_embeddings), dim=1)
188
+
189
+ if past is not None:
190
+ assert past_length is not None
191
+ past = torch.cat(past, dim=-2) # n_layer, 2, b, nh, len_past, dim_head
192
+ past_shape = list(past.shape)
193
+ expected_shape = [self.config.n_layer, 2, idx.shape[0], self.config.n_head, past_length, self.config.n_embd//self.config.n_head]
194
+ assert past_shape == expected_shape, f"{past_shape} =/= {expected_shape}"
195
+ position_embeddings = self.pos_emb[:, past_length, :] # each position maps to a (learnable) vector
196
+ else:
197
+ position_embeddings = self.pos_emb[:, :token_embeddings.shape[1], :]
198
+
199
+ x = self.drop(token_embeddings + position_embeddings)
200
+ presents = [] # accumulate over layers
201
+ for i, block in enumerate(self.blocks):
202
+ x, present = block(x, layer_past=past[i, ...] if past is not None else None, return_present=True)
203
+ presents.append(present)
204
+
205
+ x = self.ln_f(x)
206
+ logits = self.head(x)
207
+ # if we are given some desired targets also calculate the loss
208
+ loss = None
209
+ if targets is not None:
210
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
211
+
212
+ return logits, loss, torch.stack(presents) # _, _, n_layer, 2, b, nh, 1, dim_head
213
+
214
+
215
+ class DummyGPT(nn.Module):
216
+ # for debugging
217
+ def __init__(self, add_value=1):
218
+ super().__init__()
219
+ self.add_value = add_value
220
+
221
+ def forward(self, idx):
222
+ return idx + self.add_value, None
223
+
224
+
225
+ class CodeGPT(nn.Module):
226
+ """Takes in semi-embeddings"""
227
+ def __init__(self, vocab_size, block_size, in_channels, n_layer=12, n_head=8, n_embd=256,
228
+ embd_pdrop=0., resid_pdrop=0., attn_pdrop=0., n_unmasked=0):
229
+ super().__init__()
230
+ config = GPTConfig(vocab_size=vocab_size, block_size=block_size,
231
+ embd_pdrop=embd_pdrop, resid_pdrop=resid_pdrop, attn_pdrop=attn_pdrop,
232
+ n_layer=n_layer, n_head=n_head, n_embd=n_embd,
233
+ n_unmasked=n_unmasked)
234
+ # input embedding stem
235
+ self.tok_emb = nn.Linear(in_channels, config.n_embd)
236
+ self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
237
+ self.drop = nn.Dropout(config.embd_pdrop)
238
+ # transformer
239
+ self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
240
+ # decoder head
241
+ self.ln_f = nn.LayerNorm(config.n_embd)
242
+ self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
243
+ self.block_size = config.block_size
244
+ self.apply(self._init_weights)
245
+ self.config = config
246
+ logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
247
+
248
+ def get_block_size(self):
249
+ return self.block_size
250
+
251
+ def _init_weights(self, module):
252
+ if isinstance(module, (nn.Linear, nn.Embedding)):
253
+ module.weight.data.normal_(mean=0.0, std=0.02)
254
+ if isinstance(module, nn.Linear) and module.bias is not None:
255
+ module.bias.data.zero_()
256
+ elif isinstance(module, nn.LayerNorm):
257
+ module.bias.data.zero_()
258
+ module.weight.data.fill_(1.0)
259
+
260
+ def forward(self, idx, embeddings=None, targets=None):
261
+ # forward the GPT model
262
+ token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
263
+
264
+ if embeddings is not None: # prepend explicit embeddings
265
+ token_embeddings = torch.cat((embeddings, token_embeddings), dim=1)
266
+
267
+ t = token_embeddings.shape[1]
268
+ assert t <= self.block_size, "Cannot forward, model block size is exhausted."
269
+ position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
270
+ x = self.drop(token_embeddings + position_embeddings)
271
+ x = self.blocks(x)
272
+ x = self.ln_f(x)
273
+ logits = self.head(x)
274
+
275
+ # if we are given some desired targets also calculate the loss
276
+ loss = None
277
+ if targets is not None:
278
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
279
+
280
+ return logits, loss
281
+
282
+
283
+
284
+ #### sampling utils
285
+
286
+ def top_k_logits(logits, k):
287
+ v, ix = torch.topk(logits, k)
288
+ out = logits.clone()
289
+ out[out < v[:, [-1]]] = -float('Inf')
290
+ return out
291
+
292
+ @torch.no_grad()
293
+ def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
294
+ """
295
+ take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
296
+ the sequence, feeding the predictions back into the model each time. Clearly the sampling
297
+ has quadratic complexity unlike an RNN that is only linear, and has a finite context window
298
+ of block_size, unlike an RNN that has an infinite context window.
299
+ """
300
+ block_size = model.get_block_size()
301
+ model.eval()
302
+ for k in range(steps):
303
+ x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
304
+ logits, _ = model(x_cond)
305
+ # pluck the logits at the final step and scale by temperature
306
+ logits = logits[:, -1, :] / temperature
307
+ # optionally crop probabilities to only the top k options
308
+ if top_k is not None:
309
+ logits = top_k_logits(logits, top_k)
310
+ # apply softmax to convert to probabilities
311
+ probs = F.softmax(logits, dim=-1)
312
+ # sample from the distribution or take the most likely
313
+ if sample:
314
+ ix = torch.multinomial(probs, num_samples=1)
315
+ else:
316
+ _, ix = torch.topk(probs, k=1, dim=-1)
317
+ # append to the sequence and continue
318
+ x = torch.cat((x, ix), dim=1)
319
+
320
+ return x
321
+
322
+
323
+ @torch.no_grad()
324
+ def sample_with_past(x, model, steps, temperature=1., sample_logits=True,
325
+ top_k=None, top_p=None, callback=None):
326
+ # x is conditioning
327
+ sample = x
328
+ cond_len = x.shape[1]
329
+ past = None
330
+ for n in range(steps):
331
+ if callback is not None:
332
+ callback(n)
333
+ logits, _, present = model.forward_with_past(x, past=past, past_length=(n+cond_len-1))
334
+ if past is None:
335
+ past = [present]
336
+ else:
337
+ past.append(present)
338
+ logits = logits[:, -1, :] / temperature
339
+ if top_k is not None:
340
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
341
+
342
+ probs = F.softmax(logits, dim=-1)
343
+ if not sample_logits:
344
+ _, x = torch.topk(probs, k=1, dim=-1)
345
+ else:
346
+ x = torch.multinomial(probs, num_samples=1)
347
+ # append to the sequence and continue
348
+ sample = torch.cat((sample, x), dim=1)
349
+ del past
350
+ sample = sample[:, cond_len:] # cut conditioning off
351
+ return sample
352
+
353
+
354
+ #### clustering utils
355
+
356
+ class KMeans(nn.Module):
357
+ def __init__(self, ncluster=512, nc=3, niter=10):
358
+ super().__init__()
359
+ self.ncluster = ncluster
360
+ self.nc = nc
361
+ self.niter = niter
362
+ self.shape = (3,32,32)
363
+ self.register_buffer("C", torch.zeros(self.ncluster,nc))
364
+ self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))
365
+
366
+ def is_initialized(self):
367
+ return self.initialized.item() == 1
368
+
369
+ @torch.no_grad()
370
+ def initialize(self, x):
371
+ N, D = x.shape
372
+ assert D == self.nc, D
373
+ c = x[torch.randperm(N)[:self.ncluster]] # init clusters at random
374
+ for i in range(self.niter):
375
+ # assign all pixels to the closest codebook element
376
+ a = ((x[:, None, :] - c[None, :, :])**2).sum(-1).argmin(1)
377
+ # move each codebook element to be the mean of the pixels that assigned to it
378
+ c = torch.stack([x[a==k].mean(0) for k in range(self.ncluster)])
379
+ # re-assign any poorly positioned codebook elements
380
+ nanix = torch.any(torch.isnan(c), dim=1)
381
+ ndead = nanix.sum().item()
382
+ print('done step %d/%d, re-initialized %d dead clusters' % (i+1, self.niter, ndead))
383
+ c[nanix] = x[torch.randperm(N)[:ndead]] # re-init dead clusters
384
+
385
+ self.C.copy_(c)
386
+ self.initialized.fill_(1)
387
+
388
+
389
+ def forward(self, x, reverse=False, shape=None):
390
+ if not reverse:
391
+ # flatten
392
+ bs,c,h,w = x.shape
393
+ assert c == self.nc
394
+ x = x.reshape(bs,c,h*w,1)
395
+ C = self.C.permute(1,0)
396
+ C = C.reshape(1,c,1,self.ncluster)
397
+ a = ((x-C)**2).sum(1).argmin(-1) # bs, h*w indices
398
+ return a
399
+ else:
400
+ # flatten
401
+ bs, HW = x.shape
402
+ """
403
+ c = self.C.reshape( 1, self.nc, 1, self.ncluster)
404
+ c = c[bs*[0],:,:,:]
405
+ c = c[:,:,HW*[0],:]
406
+ x = x.reshape(bs, 1, HW, 1)
407
+ x = x[:,3*[0],:,:]
408
+ x = torch.gather(c, dim=3, index=x)
409
+ """
410
+ x = self.C[x]
411
+ x = x.permute(0,2,1)
412
+ shape = shape if shape is not None else self.shape
413
+ x = x.reshape(bs, *shape)
414
+
415
+ return x
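A minimal sketch of driving the GPT class and the sample helper above, assuming the taming package is importable and that the installed transformers version still exposes top_k_top_p_filtering (imported at the top of this module). The sizes are toy values, not the configuration used by any checkpoint.

import torch
from taming.modules.transformer.mingpt import GPT, sample

model = GPT(vocab_size=1024, block_size=256, n_layer=4, n_head=4, n_embd=128)

idx = torch.randint(0, 1024, (1, 16))       # conditioning token indices
logits, loss = model(idx)                   # loss is None when no targets are given
out = sample(model, idx, steps=32, temperature=1.0, sample=True, top_k=50)
print(out.shape)                            # torch.Size([1, 48]): conditioning plus 32 new tokens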
taming/modules/transformer/permuter.py ADDED
@@ -0,0 +1,248 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+
5
+
6
+ class AbstractPermuter(nn.Module):
7
+ def __init__(self, *args, **kwargs):
8
+ super().__init__()
9
+ def forward(self, x, reverse=False):
10
+ raise NotImplementedError
11
+
12
+
13
+ class Identity(AbstractPermuter):
14
+ def __init__(self):
15
+ super().__init__()
16
+
17
+ def forward(self, x, reverse=False):
18
+ return x
19
+
20
+
21
+ class Subsample(AbstractPermuter):
22
+ def __init__(self, H, W):
23
+ super().__init__()
24
+ C = 1
25
+ indices = np.arange(H*W).reshape(C,H,W)
26
+ while min(H, W) > 1:
27
+ indices = indices.reshape(C,H//2,2,W//2,2)
28
+ indices = indices.transpose(0,2,4,1,3)
29
+ indices = indices.reshape(C*4,H//2, W//2)
30
+ H = H//2
31
+ W = W//2
32
+ C = C*4
33
+ assert H == W == 1
34
+ idx = torch.tensor(indices.ravel())
35
+ self.register_buffer('forward_shuffle_idx',
36
+ nn.Parameter(idx, requires_grad=False))
37
+ self.register_buffer('backward_shuffle_idx',
38
+ nn.Parameter(torch.argsort(idx), requires_grad=False))
39
+
40
+ def forward(self, x, reverse=False):
41
+ if not reverse:
42
+ return x[:, self.forward_shuffle_idx]
43
+ else:
44
+ return x[:, self.backward_shuffle_idx]
45
+
46
+
47
+ def mortonify(i, j):
48
+ """(i,j) index to linear morton code"""
49
+ i = np.uint64(i)
50
+ j = np.uint64(j)
51
+
52
+ z = np.uint(0)
53
+
54
+ for pos in range(32):
55
+ z = (z |
56
+ ((j & (np.uint64(1) << np.uint64(pos))) << np.uint64(pos)) |
57
+ ((i & (np.uint64(1) << np.uint64(pos))) << np.uint64(pos+1))
58
+ )
59
+ return z
60
+
61
+
62
+ class ZCurve(AbstractPermuter):
63
+ def __init__(self, H, W):
64
+ super().__init__()
65
+ reverseidx = [np.int64(mortonify(i,j)) for i in range(H) for j in range(W)]
66
+ idx = np.argsort(reverseidx)
67
+ idx = torch.tensor(idx)
68
+ reverseidx = torch.tensor(reverseidx)
69
+ self.register_buffer('forward_shuffle_idx',
70
+ idx)
71
+ self.register_buffer('backward_shuffle_idx',
72
+ reverseidx)
73
+
74
+ def forward(self, x, reverse=False):
75
+ if not reverse:
76
+ return x[:, self.forward_shuffle_idx]
77
+ else:
78
+ return x[:, self.backward_shuffle_idx]
79
+
80
+
81
+ class SpiralOut(AbstractPermuter):
82
+ def __init__(self, H, W):
83
+ super().__init__()
84
+ assert H == W
85
+ size = W
86
+ indices = np.arange(size*size).reshape(size,size)
87
+
88
+ i0 = size//2
89
+ j0 = size//2-1
90
+
91
+ i = i0
92
+ j = j0
93
+
94
+ idx = [indices[i0, j0]]
95
+ step_mult = 0
96
+ for c in range(1, size//2+1):
97
+ step_mult += 1
98
+ # steps left
99
+ for k in range(step_mult):
100
+ i = i - 1
101
+ j = j
102
+ idx.append(indices[i, j])
103
+
104
+ # step down
105
+ for k in range(step_mult):
106
+ i = i
107
+ j = j + 1
108
+ idx.append(indices[i, j])
109
+
110
+ step_mult += 1
111
+ if c < size//2:
112
+ # step right
113
+ for k in range(step_mult):
114
+ i = i + 1
115
+ j = j
116
+ idx.append(indices[i, j])
117
+
118
+ # step up
119
+ for k in range(step_mult):
120
+ i = i
121
+ j = j - 1
122
+ idx.append(indices[i, j])
123
+ else:
124
+ # end reached
125
+ for k in range(step_mult-1):
126
+ i = i + 1
127
+ idx.append(indices[i, j])
128
+
129
+ assert len(idx) == size*size
130
+ idx = torch.tensor(idx)
131
+ self.register_buffer('forward_shuffle_idx', idx)
132
+ self.register_buffer('backward_shuffle_idx', torch.argsort(idx))
133
+
134
+ def forward(self, x, reverse=False):
135
+ if not reverse:
136
+ return x[:, self.forward_shuffle_idx]
137
+ else:
138
+ return x[:, self.backward_shuffle_idx]
139
+
140
+
141
+ class SpiralIn(AbstractPermuter):
142
+ def __init__(self, H, W):
143
+ super().__init__()
144
+ assert H == W
145
+ size = W
146
+ indices = np.arange(size*size).reshape(size,size)
147
+
148
+ i0 = size//2
149
+ j0 = size//2-1
150
+
151
+ i = i0
152
+ j = j0
153
+
154
+ idx = [indices[i0, j0]]
155
+ step_mult = 0
156
+ for c in range(1, size//2+1):
157
+ step_mult += 1
158
+ # steps left
159
+ for k in range(step_mult):
160
+ i = i - 1
161
+ j = j
162
+ idx.append(indices[i, j])
163
+
164
+ # step down
165
+ for k in range(step_mult):
166
+ i = i
167
+ j = j + 1
168
+ idx.append(indices[i, j])
169
+
170
+ step_mult += 1
171
+ if c < size//2:
172
+ # step right
173
+ for k in range(step_mult):
174
+ i = i + 1
175
+ j = j
176
+ idx.append(indices[i, j])
177
+
178
+ # step up
179
+ for k in range(step_mult):
180
+ i = i
181
+ j = j - 1
182
+ idx.append(indices[i, j])
183
+ else:
184
+ # end reached
185
+ for k in range(step_mult-1):
186
+ i = i + 1
187
+ idx.append(indices[i, j])
188
+
189
+ assert len(idx) == size*size
190
+ idx = idx[::-1]
191
+ idx = torch.tensor(idx)
192
+ self.register_buffer('forward_shuffle_idx', idx)
193
+ self.register_buffer('backward_shuffle_idx', torch.argsort(idx))
194
+
195
+ def forward(self, x, reverse=False):
196
+ if not reverse:
197
+ return x[:, self.forward_shuffle_idx]
198
+ else:
199
+ return x[:, self.backward_shuffle_idx]
200
+
201
+
202
+ class Random(nn.Module):
203
+ def __init__(self, H, W):
204
+ super().__init__()
205
+ indices = np.random.RandomState(1).permutation(H*W)
206
+ idx = torch.tensor(indices.ravel())
207
+ self.register_buffer('forward_shuffle_idx', idx)
208
+ self.register_buffer('backward_shuffle_idx', torch.argsort(idx))
209
+
210
+ def forward(self, x, reverse=False):
211
+ if not reverse:
212
+ return x[:, self.forward_shuffle_idx]
213
+ else:
214
+ return x[:, self.backward_shuffle_idx]
215
+
216
+
217
+ class AlternateParsing(AbstractPermuter):
218
+ def __init__(self, H, W):
219
+ super().__init__()
220
+ indices = np.arange(W*H).reshape(H,W)
221
+ for i in range(1, H, 2):
222
+ indices[i, :] = indices[i, ::-1]
223
+ idx = indices.flatten()
224
+ assert len(idx) == H*W
225
+ idx = torch.tensor(idx)
226
+ self.register_buffer('forward_shuffle_idx', idx)
227
+ self.register_buffer('backward_shuffle_idx', torch.argsort(idx))
228
+
229
+ def forward(self, x, reverse=False):
230
+ if not reverse:
231
+ return x[:, self.forward_shuffle_idx]
232
+ else:
233
+ return x[:, self.backward_shuffle_idx]
234
+
235
+
236
+ if __name__ == "__main__":
237
+ p0 = AlternateParsing(16, 16)
238
+ print(p0.forward_shuffle_idx)
239
+ print(p0.backward_shuffle_idx)
240
+
241
+ x = torch.randint(0, 768, size=(11, 256))
242
+ y = p0(x)
243
+ xre = p0(y, reverse=True)
244
+ assert torch.equal(x, xre)
245
+
246
+ p1 = SpiralOut(2, 2)
247
+ print(p1.forward_shuffle_idx)
248
+ print(p1.backward_shuffle_idx)
taming/modules/util.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ def count_params(model):
6
+ total_params = sum(p.numel() for p in model.parameters())
7
+ return total_params
8
+
9
+
10
+ class ActNorm(nn.Module):
11
+ def __init__(self, num_features, logdet=False, affine=True,
12
+ allow_reverse_init=False):
13
+ assert affine
14
+ super().__init__()
15
+ self.logdet = logdet
16
+ self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
17
+ self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
18
+ self.allow_reverse_init = allow_reverse_init
19
+
20
+ self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))
21
+
22
+ def initialize(self, input):
23
+ with torch.no_grad():
24
+ flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
25
+ mean = (
26
+ flatten.mean(1)
27
+ .unsqueeze(1)
28
+ .unsqueeze(2)
29
+ .unsqueeze(3)
30
+ .permute(1, 0, 2, 3)
31
+ )
32
+ std = (
33
+ flatten.std(1)
34
+ .unsqueeze(1)
35
+ .unsqueeze(2)
36
+ .unsqueeze(3)
37
+ .permute(1, 0, 2, 3)
38
+ )
39
+
40
+ self.loc.data.copy_(-mean)
41
+ self.scale.data.copy_(1 / (std + 1e-6))
42
+
43
+ def forward(self, input, reverse=False):
44
+ if reverse:
45
+ return self.reverse(input)
46
+ if len(input.shape) == 2:
47
+ input = input[:,:,None,None]
48
+ squeeze = True
49
+ else:
50
+ squeeze = False
51
+
52
+ _, _, height, width = input.shape
53
+
54
+ if self.training and self.initialized.item() == 0:
55
+ self.initialize(input)
56
+ self.initialized.fill_(1)
57
+
58
+ h = self.scale * (input + self.loc)
59
+
60
+ if squeeze:
61
+ h = h.squeeze(-1).squeeze(-1)
62
+
63
+ if self.logdet:
64
+ log_abs = torch.log(torch.abs(self.scale))
65
+ logdet = height*width*torch.sum(log_abs)
66
+ logdet = logdet * torch.ones(input.shape[0]).to(input)
67
+ return h, logdet
68
+
69
+ return h
70
+
71
+ def reverse(self, output):
72
+ if self.training and self.initialized.item() == 0:
73
+ if not self.allow_reverse_init:
74
+ raise RuntimeError(
75
+ "Initializing ActNorm in reverse direction is "
76
+ "disabled by default. Use allow_reverse_init=True to enable."
77
+ )
78
+ else:
79
+ self.initialize(output)
80
+ self.initialized.fill_(1)
81
+
82
+ if len(output.shape) == 2:
83
+ output = output[:,:,None,None]
84
+ squeeze = True
85
+ else:
86
+ squeeze = False
87
+
88
+ h = output / self.scale - self.loc
89
+
90
+ if squeeze:
91
+ h = h.squeeze(-1).squeeze(-1)
92
+ return h
93
+
94
+
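ActNorm above is a per-channel affine layer whose `loc` and `scale` are filled in from the statistics of the first training batch it sees (data-dependent initialization), optionally returning the log-determinant of the transform. A small usage sketch, assuming the class imports from `taming.modules.util`:

import torch
from taming.modules.util import ActNorm

norm = ActNorm(num_features=8, logdet=True)
x = torch.randn(4, 8, 16, 16)

norm.train()
y, logdet = norm(x)            # first call initializes loc/scale from this batch
print(y.shape, logdet.shape)   # torch.Size([4, 8, 16, 16]) torch.Size([4])
print(y.mean(dim=(0, 2, 3)))   # roughly zero per channel after initialization

x_rec = norm(y, reverse=True)  # invert the affine map
print(torch.allclose(x, x_rec, atol=1e-4))   # True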
95
+ class AbstractEncoder(nn.Module):
96
+ def __init__(self):
97
+ super().__init__()
98
+
99
+ def encode(self, *args, **kwargs):
100
+ raise NotImplementedError
101
+
102
+
103
+ class Labelator(AbstractEncoder):
104
+ """Net2Net Interface for Class-Conditional Model"""
105
+ def __init__(self, n_classes, quantize_interface=True):
106
+ super().__init__()
107
+ self.n_classes = n_classes
108
+ self.quantize_interface = quantize_interface
109
+
110
+ def encode(self, c):
111
+ c = c[:,None]
112
+ if self.quantize_interface:
113
+ return c, None, [None, None, c.long()]
114
+ return c
115
+
116
+
117
+ class SOSProvider(AbstractEncoder):
118
+ # for unconditional training
119
+ def __init__(self, sos_token, quantize_interface=True):
120
+ super().__init__()
121
+ self.sos_token = sos_token
122
+ self.quantize_interface = quantize_interface
123
+
124
+ def encode(self, x):
125
+ # get batch size from data and replicate sos_token
126
+ c = torch.ones(x.shape[0], 1)*self.sos_token
127
+ c = c.long().to(x.device)
128
+ if self.quantize_interface:
129
+ return c, None, [None, None, c]
130
+ return c
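Labelator and SOSProvider mimic the `encode` interface of the quantized autoencoders: the last element of the returned tuple carries discrete indices, so the downstream transformer can condition on class labels or a start-of-sequence token exactly as it does on image codes. A brief shape check, under the same import assumption as above:

import torch
from taming.modules.util import Labelator, SOSProvider

labels = torch.tensor([3, 7])                 # class ids for a batch of two
lab = Labelator(n_classes=10)
c, _, (_, _, c_idx) = lab.encode(labels)
print(c.shape, c_idx.shape)                   # torch.Size([2, 1]) torch.Size([2, 1])

x = torch.randint(0, 1024, (2, 256))          # dummy image token sequences
sos = SOSProvider(sos_token=0)
c, _, (_, _, c_idx) = sos.encode(x)
print(c_idx)                                  # tensor([[0], [0]]) - one SOS token per sample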
taming/modules/vqvae/quantize.py ADDED
@@ -0,0 +1,445 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from torch import einsum
6
+ from einops import rearrange
7
+
8
+
9
+ class VectorQuantizer(nn.Module):
10
+ """
11
+ see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
12
+ ____________________________________________
13
+ Discretization bottleneck part of the VQ-VAE.
14
+ Inputs:
15
+ - n_e : number of embeddings
16
+ - e_dim : dimension of embedding
17
+ - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
18
+ _____________________________________________
19
+ """
20
+
21
+ # NOTE: this class contains a bug regarding beta; see VectorQuantizer2 for
22
+ # a fix and use legacy=False to apply that fix. VectorQuantizer2 can be
23
+ # used wherever VectorQuantizer has been used before and is additionally
24
+ # more efficient.
25
+ def __init__(self, n_e, e_dim, beta):
26
+ super(VectorQuantizer, self).__init__()
27
+ self.n_e = n_e
28
+ self.e_dim = e_dim
29
+ self.beta = beta
30
+
31
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
32
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
33
+
34
+ def forward(self, z):
35
+ """
36
+ Inputs the output of the encoder network z and maps it to a discrete
37
+ one-hot vector that is the index of the closest embedding vector e_j
38
+ z (continuous) -> z_q (discrete)
39
+ z.shape = (batch, channel, height, width)
40
+ quantization pipeline:
41
+ 1. get encoder input (B,C,H,W)
42
+ 2. flatten input to (B*H*W,C)
43
+ """
44
+ # reshape z -> (batch, height, width, channel) and flatten
45
+ z = z.permute(0, 2, 3, 1).contiguous()
46
+ z_flattened = z.view(-1, self.e_dim)
47
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
48
+
49
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
50
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
51
+ torch.matmul(z_flattened, self.embedding.weight.t())
52
+
53
+ ## could possibly replace this here
54
+ # #\start...
55
+ # find closest encodings
56
+ min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
57
+
58
+ min_encodings = torch.zeros(
59
+ min_encoding_indices.shape[0], self.n_e).to(z)
60
+ min_encodings.scatter_(1, min_encoding_indices, 1)
61
+
62
+ # dtype min encodings: torch.float32
63
+ # min_encodings shape: torch.Size([2048, 512])
64
+ # min_encoding_indices.shape: torch.Size([2048, 1])
65
+
66
+ # get quantized latent vectors
67
+ z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
68
+ #.........\end
69
+
70
+ # with:
71
+ # .........\start
72
+ #min_encoding_indices = torch.argmin(d, dim=1)
73
+ #z_q = self.embedding(min_encoding_indices)
74
+ # ......\end......... (TODO)
75
+
76
+ # compute loss for embedding
77
+ loss = torch.mean((z_q.detach()-z)**2) + self.beta * \
78
+ torch.mean((z_q - z.detach()) ** 2)
79
+
80
+ # preserve gradients
81
+ z_q = z + (z_q - z).detach()
82
+
83
+ # perplexity
84
+ e_mean = torch.mean(min_encodings, dim=0)
85
+ perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
86
+
87
+ # reshape back to match original input shape
88
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
89
+
90
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
91
+
92
+ def get_codebook_entry(self, indices, shape):
93
+ # shape specifying (batch, height, width, channel)
94
+ # TODO: check for more easy handling with nn.Embedding
95
+ min_encodings = torch.zeros(indices.shape[0], self.n_e).to(indices)
96
+ min_encodings.scatter_(1, indices[:,None], 1)
97
+
98
+ # get quantized latent vectors
99
+ z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
100
+
101
+ if shape is not None:
102
+ z_q = z_q.view(shape)
103
+
104
+ # reshape back to match original input shape
105
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
106
+
107
+ return z_q
108
+
109
+
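As the docstring above describes, VectorQuantizer snaps every spatial feature vector to its nearest codebook entry and wires the result with a straight-through estimator so gradients can reach the encoder. A minimal round trip, assuming the module imports as `taming.modules.vqvae.quantize`:

import torch
from taming.modules.vqvae.quantize import VectorQuantizer

quantizer = VectorQuantizer(n_e=1024, e_dim=64, beta=0.25)
z = torch.randn(2, 64, 16, 16, requires_grad=True)   # encoder output (B, C, H, W)

z_q, loss, (perplexity, min_encodings, indices) = quantizer(z)
print(z_q.shape)       # torch.Size([2, 64, 16, 16]) - same shape as the input
print(indices.shape)   # torch.Size([512, 1]) - one codebook index per spatial position (2*16*16)

# `loss` is the codebook/commitment term; z_q itself carries straight-through
# gradients, so a reconstruction loss on z_q would also reach z.
(loss + z_q.mean()).backward()
print(z.grad is not None)   # True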
110
+ class GumbelQuantize(nn.Module):
111
+ """
112
+ credit to @karpathy: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py (thanks!)
113
+ Gumbel Softmax trick quantizer
114
+ Categorical Reparameterization with Gumbel-Softmax, Jang et al. 2016
115
+ https://arxiv.org/abs/1611.01144
116
+ """
117
+ def __init__(self, num_hiddens, embedding_dim, n_embed, straight_through=True,
118
+ kl_weight=5e-4, temp_init=1.0, use_vqinterface=True,
119
+ remap=None, unknown_index="random"):
120
+ super().__init__()
121
+
122
+ self.embedding_dim = embedding_dim
123
+ self.n_embed = n_embed
124
+
125
+ self.straight_through = straight_through
126
+ self.temperature = temp_init
127
+ self.kl_weight = kl_weight
128
+
129
+ self.proj = nn.Conv2d(num_hiddens, n_embed, 1)
130
+ self.embed = nn.Embedding(n_embed, embedding_dim)
131
+
132
+ self.use_vqinterface = use_vqinterface
133
+
134
+ self.remap = remap
135
+ if self.remap is not None:
136
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
137
+ self.re_embed = self.used.shape[0]
138
+ self.unknown_index = unknown_index # "random" or "extra" or integer
139
+ if self.unknown_index == "extra":
140
+ self.unknown_index = self.re_embed
141
+ self.re_embed = self.re_embed+1
142
+ print(f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
143
+ f"Using {self.unknown_index} for unknown indices.")
144
+ else:
145
+ self.re_embed = n_embed
146
+
147
+ def remap_to_used(self, inds):
148
+ ishape = inds.shape
149
+ assert len(ishape)>1
150
+ inds = inds.reshape(ishape[0],-1)
151
+ used = self.used.to(inds)
152
+ match = (inds[:,:,None]==used[None,None,...]).long()
153
+ new = match.argmax(-1)
154
+ unknown = match.sum(2)<1
155
+ if self.unknown_index == "random":
156
+ new[unknown]=torch.randint(0,self.re_embed,size=new[unknown].shape).to(device=new.device)
157
+ else:
158
+ new[unknown] = self.unknown_index
159
+ return new.reshape(ishape)
160
+
161
+ def unmap_to_all(self, inds):
162
+ ishape = inds.shape
163
+ assert len(ishape)>1
164
+ inds = inds.reshape(ishape[0],-1)
165
+ used = self.used.to(inds)
166
+ if self.re_embed > self.used.shape[0]: # extra token
167
+ inds[inds>=self.used.shape[0]] = 0 # simply set to zero
168
+ back=torch.gather(used[None,:][inds.shape[0]*[0],:], 1, inds)
169
+ return back.reshape(ishape)
170
+
171
+ def forward(self, z, temp=None, return_logits=False):
172
+ # force hard = True when we are in eval mode, as we must quantize. actually, always true seems to work
173
+ hard = self.straight_through if self.training else True
174
+ temp = self.temperature if temp is None else temp
175
+
176
+ logits = self.proj(z)
177
+ if self.remap is not None:
178
+ # continue only with used logits
179
+ full_zeros = torch.zeros_like(logits)
180
+ logits = logits[:,self.used,...]
181
+
182
+ soft_one_hot = F.gumbel_softmax(logits, tau=temp, dim=1, hard=hard)
183
+ if self.remap is not None:
184
+ # go back to all entries but unused set to zero
185
+ full_zeros[:,self.used,...] = soft_one_hot
186
+ soft_one_hot = full_zeros
187
+ z_q = einsum('b n h w, n d -> b d h w', soft_one_hot, self.embed.weight)
188
+
189
+ # + kl divergence to the prior loss
190
+ qy = F.softmax(logits, dim=1)
191
+ diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.n_embed + 1e-10), dim=1).mean()
192
+
193
+ ind = soft_one_hot.argmax(dim=1)
194
+ if self.remap is not None:
195
+ ind = self.remap_to_used(ind)
196
+ if self.use_vqinterface:
197
+ if return_logits:
198
+ return z_q, diff, (None, None, ind), logits
199
+ return z_q, diff, (None, None, ind)
200
+ return z_q, diff, ind
201
+
202
+ def get_codebook_entry(self, indices, shape):
203
+ b, h, w, c = shape
204
+ assert b*h*w == indices.shape[0]
205
+ indices = rearrange(indices, '(b h w) -> b h w', b=b, h=h, w=w)
206
+ if self.remap is not None:
207
+ indices = self.unmap_to_all(indices)
208
+ one_hot = F.one_hot(indices, num_classes=self.n_embed).permute(0, 3, 1, 2).float()
209
+ z_q = einsum('b n h w, n d -> b d h w', one_hot, self.embed.weight)
210
+ return z_q
211
+
212
+
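GumbelQuantize replaces the hard nearest-neighbour assignment with a Gumbel-softmax relaxation, keeping the codebook choice differentiable during training; the temperature and the KL weight are the knobs that are usually annealed. A short sketch of driving it, with the same import assumption:

import torch
from taming.modules.vqvae.quantize import GumbelQuantize

quantizer = GumbelQuantize(num_hiddens=128, embedding_dim=64, n_embed=1024,
                           kl_weight=5e-4, temp_init=1.0)
h = torch.randn(2, 128, 16, 16)   # encoder features before the 1x1 projection

quantizer.train()
z_q, kl_loss, (_, _, indices) = quantizer(h, temp=0.9)   # straight-through Gumbel sample
print(z_q.shape, indices.shape)   # torch.Size([2, 64, 16, 16]) torch.Size([2, 16, 16])

quantizer.eval()
z_q_eval, _, _ = quantizer(h)     # hard one-hot assignment at inference time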
213
+ class VectorQuantizer2(nn.Module):
214
+ """
215
+ Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
216
+ avoids costly matrix multiplications and allows for post-hoc remapping of indices.
217
+ """
218
+ # NOTE: due to a bug the beta term was applied to the wrong term. for
219
+ # backwards compatibility we use the buggy version by default, but you can
220
+ # specify legacy=False to fix it.
221
+ def __init__(self, n_e, e_dim, beta, remap=None, unknown_index="random",
222
+ sane_index_shape=False, legacy=True):
223
+ super().__init__()
224
+ self.n_e = n_e
225
+ self.e_dim = e_dim
226
+ self.beta = beta
227
+ self.legacy = legacy
228
+
229
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
230
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
231
+
232
+ self.remap = remap
233
+ if self.remap is not None:
234
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
235
+ self.re_embed = self.used.shape[0]
236
+ self.unknown_index = unknown_index # "random" or "extra" or integer
237
+ if self.unknown_index == "extra":
238
+ self.unknown_index = self.re_embed
239
+ self.re_embed = self.re_embed+1
240
+ print(f"Remapping {self.n_e} indices to {self.re_embed} indices. "
241
+ f"Using {self.unknown_index} for unknown indices.")
242
+ else:
243
+ self.re_embed = n_e
244
+
245
+ self.sane_index_shape = sane_index_shape
246
+
247
+ def remap_to_used(self, inds):
248
+ ishape = inds.shape
249
+ assert len(ishape)>1
250
+ inds = inds.reshape(ishape[0],-1)
251
+ used = self.used.to(inds)
252
+ match = (inds[:,:,None]==used[None,None,...]).long()
253
+ new = match.argmax(-1)
254
+ unknown = match.sum(2)<1
255
+ if self.unknown_index == "random":
256
+ new[unknown]=torch.randint(0,self.re_embed,size=new[unknown].shape).to(device=new.device)
257
+ else:
258
+ new[unknown] = self.unknown_index
259
+ return new.reshape(ishape)
260
+
261
+ def unmap_to_all(self, inds):
262
+ ishape = inds.shape
263
+ assert len(ishape)>1
264
+ inds = inds.reshape(ishape[0],-1)
265
+ used = self.used.to(inds)
266
+ if self.re_embed > self.used.shape[0]: # extra token
267
+ inds[inds>=self.used.shape[0]] = 0 # simply set to zero
268
+ back=torch.gather(used[None,:][inds.shape[0]*[0],:], 1, inds)
269
+ return back.reshape(ishape)
270
+
271
+ def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
272
+ assert temp is None or temp==1.0, "Only for interface compatible with Gumbel"
273
+ assert rescale_logits==False, "Only for interface compatible with Gumbel"
274
+ assert return_logits==False, "Only for interface compatible with Gumbel"
275
+ # reshape z -> (batch, height, width, channel) and flatten
276
+ z = rearrange(z, 'b c h w -> b h w c').contiguous()
277
+ z_flattened = z.view(-1, self.e_dim)
278
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
279
+
280
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
281
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
282
+ torch.einsum('bd,dn->bn', z_flattened, rearrange(self.embedding.weight, 'n d -> d n'))
283
+
284
+ min_encoding_indices = torch.argmin(d, dim=1)
285
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
286
+ perplexity = None
287
+ min_encodings = None
288
+
289
+ # compute loss for embedding
290
+ if not self.legacy:
291
+ loss = self.beta * torch.mean((z_q.detach()-z)**2) + \
292
+ torch.mean((z_q - z.detach()) ** 2)
293
+ else:
294
+ loss = torch.mean((z_q.detach()-z)**2) + self.beta * \
295
+ torch.mean((z_q - z.detach()) ** 2)
296
+
297
+ # preserve gradients
298
+ z_q = z + (z_q - z).detach()
299
+
300
+ # reshape back to match original input shape
301
+ z_q = rearrange(z_q, 'b h w c -> b c h w').contiguous()
302
+
303
+ if self.remap is not None:
304
+ min_encoding_indices = min_encoding_indices.reshape(z.shape[0],-1) # add batch axis
305
+ min_encoding_indices = self.remap_to_used(min_encoding_indices)
306
+ min_encoding_indices = min_encoding_indices.reshape(-1,1) # flatten
307
+
308
+ if self.sane_index_shape:
309
+ min_encoding_indices = min_encoding_indices.reshape(
310
+ z_q.shape[0], z_q.shape[2], z_q.shape[3])
311
+
312
+ return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
313
+
314
+ def get_codebook_entry(self, indices, shape):
315
+ # shape specifying (batch, height, width, channel)
316
+ if self.remap is not None:
317
+ indices = indices.reshape(shape[0],-1) # add batch axis
318
+ indices = self.unmap_to_all(indices)
319
+ indices = indices.reshape(-1) # flatten again
320
+
321
+ # get quantized latent vectors
322
+ z_q = self.embedding(indices)
323
+
324
+ if shape is not None:
325
+ z_q = z_q.view(shape)
326
+ # reshape back to match original input shape
327
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
328
+
329
+ return z_q
330
+
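The NOTE above is worth spelling out: with `legacy=True`, `beta` weights the term that pulls the codebook entries towards the (detached) encoder output, while the corrected `legacy=False` form weights the encoder commitment term instead, i.e. loss = beta * mean((sg[z_q] - z)**2) + mean((z_q - sg[z])**2). A small comparison sketch, with the same import assumption:

import torch
from taming.modules.vqvae.quantize import VectorQuantizer2

z = torch.randn(2, 64, 16, 16)

torch.manual_seed(0)
vq_legacy = VectorQuantizer2(n_e=1024, e_dim=64, beta=0.25, legacy=True)
torch.manual_seed(0)
vq_fixed = VectorQuantizer2(n_e=1024, e_dim=64, beta=0.25, legacy=False,
                            sane_index_shape=True)

_, loss_legacy, _ = vq_legacy(z)
z_q, loss_fixed, (_, _, indices) = vq_fixed(z)
print(loss_legacy.item(), loss_fixed.item())   # beta lands on different terms
print(indices.shape)                           # torch.Size([2, 16, 16]) with sane_index_shape=True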
331
+ class EmbeddingEMA(nn.Module):
332
+ def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5):
333
+ super().__init__()
334
+ self.decay = decay
335
+ self.eps = eps
336
+ weight = torch.randn(num_tokens, codebook_dim)
337
+ self.weight = nn.Parameter(weight, requires_grad = False)
338
+ self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad = False)
339
+ self.embed_avg = nn.Parameter(weight.clone(), requires_grad = False)
340
+ self.update = True
341
+
342
+ def forward(self, embed_id):
343
+ return F.embedding(embed_id, self.weight)
344
+
345
+ def cluster_size_ema_update(self, new_cluster_size):
346
+ self.cluster_size.data.mul_(self.decay).add_(new_cluster_size, alpha=1 - self.decay)
347
+
348
+ def embed_avg_ema_update(self, new_embed_avg):
349
+ self.embed_avg.data.mul_(self.decay).add_(new_embed_avg, alpha=1 - self.decay)
350
+
351
+ def weight_update(self, num_tokens):
352
+ n = self.cluster_size.sum()
353
+ smoothed_cluster_size = (
354
+ (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n
355
+ )
356
+ #normalize embedding average with smoothed cluster size
357
+ embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1)
358
+ self.weight.data.copy_(embed_normalized)
359
+
360
+
361
+ class EMAVectorQuantizer(nn.Module):
362
+ def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5,
363
+ remap=None, unknown_index="random"):
364
+ super().__init__()
365
+ self.codebook_dim = embedding_dim
366
+ self.num_tokens = n_embed
367
+ self.beta = beta
368
+ self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps)
369
+
370
+ self.remap = remap
371
+ if self.remap is not None:
372
+ self.register_buffer("used", torch.tensor(np.load(self.remap)))
373
+ self.re_embed = self.used.shape[0]
374
+ self.unknown_index = unknown_index # "random" or "extra" or integer
375
+ if self.unknown_index == "extra":
376
+ self.unknown_index = self.re_embed
377
+ self.re_embed = self.re_embed+1
378
+ print(f"Remapping {self.n_embed} indices to {self.re_embed} indices. "
379
+ f"Using {self.unknown_index} for unknown indices.")
380
+ else:
381
+ self.re_embed = n_embed
382
+
383
+ def remap_to_used(self, inds):
384
+ ishape = inds.shape
385
+ assert len(ishape)>1
386
+ inds = inds.reshape(ishape[0],-1)
387
+ used = self.used.to(inds)
388
+ match = (inds[:,:,None]==used[None,None,...]).long()
389
+ new = match.argmax(-1)
390
+ unknown = match.sum(2)<1
391
+ if self.unknown_index == "random":
392
+ new[unknown]=torch.randint(0,self.re_embed,size=new[unknown].shape).to(device=new.device)
393
+ else:
394
+ new[unknown] = self.unknown_index
395
+ return new.reshape(ishape)
396
+
397
+ def unmap_to_all(self, inds):
398
+ ishape = inds.shape
399
+ assert len(ishape)>1
400
+ inds = inds.reshape(ishape[0],-1)
401
+ used = self.used.to(inds)
402
+ if self.re_embed > self.used.shape[0]: # extra token
403
+ inds[inds>=self.used.shape[0]] = 0 # simply set to zero
404
+ back=torch.gather(used[None,:][inds.shape[0]*[0],:], 1, inds)
405
+ return back.reshape(ishape)
406
+
407
+ def forward(self, z):
408
+ # reshape z -> (batch, height, width, channel) and flatten
409
+ #z, 'b c h w -> b h w c'
410
+ z = rearrange(z, 'b c h w -> b h w c')
411
+ z_flattened = z.reshape(-1, self.codebook_dim)
412
+
413
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
414
+ d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \
415
+ self.embedding.weight.pow(2).sum(dim=1) - 2 * \
416
+ torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n'
417
+
418
+
419
+ encoding_indices = torch.argmin(d, dim=1)
420
+
421
+ z_q = self.embedding(encoding_indices).view(z.shape)
422
+ encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
423
+ avg_probs = torch.mean(encodings, dim=0)
424
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
425
+
426
+ if self.training and self.embedding.update:
427
+ #EMA cluster size
428
+ encodings_sum = encodings.sum(0)
429
+ self.embedding.cluster_size_ema_update(encodings_sum)
430
+ #EMA embedding average
431
+ embed_sum = encodings.transpose(0,1) @ z_flattened
432
+ self.embedding.embed_avg_ema_update(embed_sum)
433
+ #normalize embed_avg and update weight
434
+ self.embedding.weight_update(self.num_tokens)
435
+
436
+ # compute loss for embedding
437
+ loss = self.beta * F.mse_loss(z_q.detach(), z)
438
+
439
+ # preserve gradients
440
+ z_q = z + (z_q - z).detach()
441
+
442
+ # reshape back to match original input shape
443
+ #z_q, 'b h w c -> b c h w'
444
+ z_q = rearrange(z_q, 'b h w c -> b c h w')
445
+ return z_q, loss, (perplexity, encodings, encoding_indices)
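EMAVectorQuantizer keeps the codebook out of the optimizer altogether: EmbeddingEMA stores the weights with `requires_grad=False`, and the `*_ema_update` calls inside `forward` refresh them from exponential moving averages of the assigned encoder features, so only the commitment loss `beta * F.mse_loss(z_q.detach(), z)` back-propagates. A sketch of one training step, assuming `n_embed` and `embedding_dim` populate the token count and codebook dimension as intended by the constructor:

import torch
from taming.modules.vqvae.quantize import EMAVectorQuantizer

quantizer = EMAVectorQuantizer(n_embed=1024, embedding_dim=64, beta=0.25, decay=0.99)
z = torch.randn(2, 64, 16, 16, requires_grad=True)

quantizer.train()
z_q, commit_loss, (perplexity, _, indices) = quantizer(z)   # EMA codebook update happens inside forward
commit_loss.backward()                                      # trains the encoder only

print(z_q.shape)          # torch.Size([2, 64, 16, 16])
print(perplexity.item())  # effective codebook usage for this batch
print(quantizer.embedding.weight.requires_grad)   # False - the codebook is EMA-updated, not optimized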
taming/util.py ADDED
@@ -0,0 +1,157 @@
1
+ import os, hashlib
2
+ import requests
3
+ from tqdm import tqdm
4
+
5
+ URL_MAP = {
6
+ "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
7
+ }
8
+
9
+ CKPT_MAP = {
10
+ "vgg_lpips": "vgg.pth"
11
+ }
12
+
13
+ MD5_MAP = {
14
+ "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
15
+ }
16
+
17
+
18
+ def download(url, local_path, chunk_size=1024):
19
+ os.makedirs(os.path.split(local_path)[0], exist_ok=True)
20
+ with requests.get(url, stream=True) as r:
21
+ total_size = int(r.headers.get("content-length", 0))
22
+ with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
23
+ with open(local_path, "wb") as f:
24
+ for data in r.iter_content(chunk_size=chunk_size):
25
+ if data:
26
+ f.write(data)
27
+ pbar.update(len(data))
28
+
29
+
30
+ def md5_hash(path):
31
+ with open(path, "rb") as f:
32
+ content = f.read()
33
+ return hashlib.md5(content).hexdigest()
34
+
35
+
36
+ def get_ckpt_path(name, root, check=False):
37
+ assert name in URL_MAP
38
+ path = os.path.join(root, CKPT_MAP[name])
39
+ if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
40
+ print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
41
+ download(URL_MAP[name], path)
42
+ md5 = md5_hash(path)
43
+ assert md5 == MD5_MAP[name], md5
44
+ return path
45
+
46
+
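get_ckpt_path lazily downloads a named checkpoint into the given directory and, with `check=True`, verifies its MD5 before returning the local path; helpers like this are typically used to fetch the VGG weights for the LPIPS perceptual loss. A minimal sketch (note that the first call performs a real download):

from taming.util import get_ckpt_path

path = get_ckpt_path("vgg_lpips", root="checkpoints", check=True)
print(path)   # checkpoints/vgg.pth, with its MD5 checked against MD5_MAP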
47
+ class KeyNotFoundError(Exception):
48
+ def __init__(self, cause, keys=None, visited=None):
49
+ self.cause = cause
50
+ self.keys = keys
51
+ self.visited = visited
52
+ messages = list()
53
+ if keys is not None:
54
+ messages.append("Key not found: {}".format(keys))
55
+ if visited is not None:
56
+ messages.append("Visited: {}".format(visited))
57
+ messages.append("Cause:\n{}".format(cause))
58
+ message = "\n".join(messages)
59
+ super().__init__(message)
60
+
61
+
62
+ def retrieve(
63
+ list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
64
+ ):
65
+ """Given a nested list or dict return the desired value at key expanding
66
+ callable nodes if necessary and :attr:`expand` is ``True``. The expansion
67
+ is done in-place.
68
+
69
+ Parameters
70
+ ----------
71
+ list_or_dict : list or dict
72
+ Possibly nested list or dictionary.
73
+ key : str
74
+ key/to/value, path like string describing all keys necessary to
75
+ consider to get to the desired value. List indices can also be
76
+ passed here.
77
+ splitval : str
78
+ String that defines the delimiter between keys of the
79
+ different depth levels in `key`.
80
+ default : obj
81
+ Value returned if :attr:`key` is not found.
82
+ expand : bool
83
+ Whether to expand callable nodes on the path or not.
84
+
85
+ Returns
86
+ -------
87
+ The desired value or if :attr:`default` is not ``None`` and the
88
+ :attr:`key` is not found returns ``default``.
89
+
90
+ Raises
91
+ ------
92
+ Exception if ``key`` not in ``list_or_dict`` and :attr:`default` is
93
+ ``None``.
94
+ """
95
+
96
+ keys = key.split(splitval)
97
+
98
+ success = True
99
+ try:
100
+ visited = []
101
+ parent = None
102
+ last_key = None
103
+ for key in keys:
104
+ if callable(list_or_dict):
105
+ if not expand:
106
+ raise KeyNotFoundError(
107
+ ValueError(
108
+ "Trying to get past callable node with expand=False."
109
+ ),
110
+ keys=keys,
111
+ visited=visited,
112
+ )
113
+ list_or_dict = list_or_dict()
114
+ parent[last_key] = list_or_dict
115
+
116
+ last_key = key
117
+ parent = list_or_dict
118
+
119
+ try:
120
+ if isinstance(list_or_dict, dict):
121
+ list_or_dict = list_or_dict[key]
122
+ else:
123
+ list_or_dict = list_or_dict[int(key)]
124
+ except (KeyError, IndexError, ValueError) as e:
125
+ raise KeyNotFoundError(e, keys=keys, visited=visited)
126
+
127
+ visited += [key]
128
+ # final expansion of retrieved value
129
+ if expand and callable(list_or_dict):
130
+ list_or_dict = list_or_dict()
131
+ parent[last_key] = list_or_dict
132
+ except KeyNotFoundError as e:
133
+ if default is None:
134
+ raise e
135
+ else:
136
+ list_or_dict = default
137
+ success = False
138
+
139
+ if not pass_success:
140
+ return list_or_dict
141
+ else:
142
+ return list_or_dict, success
143
+
144
+
145
+ if __name__ == "__main__":
146
+ config = {"keya": "a",
147
+ "keyb": "b",
148
+ "keyc":
149
+ {"cc1": 1,
150
+ "cc2": 2,
151
+ }
152
+ }
153
+ from omegaconf import OmegaConf
154
+ config = OmegaConf.create(config)
155
+ print(config)
156
+ retrieve(config, "keya")
157
+
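The docstring of `retrieve` describes path-like access into nested configs, expansion of callable nodes, and a default fallback, but the check above only exercises a flat key. A slightly fuller sketch on a plain dict (the lambda is only there to illustrate callable-node expansion):

from taming.util import retrieve

config = {"keya": "a",
          "keyc": {"cc1": 1, "cc2": 2},
          "keyd": lambda: {"nested": 42}}

print(retrieve(config, "keyc/cc1"))                  # 1 - nested access via '/'
print(retrieve(config, "keyc/missing", default=13))  # 13 - default instead of an exception
print(retrieve(config, "keyd/nested"))               # 42 - callable node expanded in place
value, ok = retrieve(config, "nope", default="fallback", pass_success=True)
print(value, ok)                                     # fallback False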