Spaces: Running on Zero

realantonvoronov committed · Commit 1dc27f0 · 1 Parent(s): 484ca0e

update for 1024

Browse files:
- app.py +8 -8
- models/helpers.py +7 -0
- models/pipeline.py +21 -15
- models/switti.py +4 -5
- models/vqvae.py +3 -2
app.py
CHANGED
@@ -1,16 +1,16 @@
-import gradio as gr
-import numpy as np
 import random
 
+import gradio as gr
+import numpy as np
 import spaces
-from models import SwittiPipeline
 import torch
 
-
-model_repo_id = "yresearch/Switti"
+from models import SwittiPipeline
 
 
-
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_repo_id = "yresearch/Switti-1024"
+pipe = SwittiPipeline.from_pretrained(model_repo_id, device=device, torch_dtype=torch.bfloat16)
 
 MAX_SEED = np.iinfo(np.int32).max
 
@@ -140,9 +140,9 @@ with gr.Blocks(css=css) as demo:
                 turn_off_cfg_start_si = gr.Slider(
                     label="Disable CFG starting scale",
                     minimum=0,
-                    maximum=
+                    maximum=14,
                     step=1,
-                    value=
+                    value=11,
                 )
             with gr.Row():
                 more_diverse = gr.Checkbox(label="More diverse", value=False)
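The updated app.py now builds the pipeline once at module import time. Below is a minimal sketch of how this setup could be exercised outside the Space: the loading lines mirror the additions above, while the generation call is purely illustrative, since the Space's inference function is not part of this diff (exposing the sampler as pipe(...) and returning PIL images are assumptions; the keyword arguments are taken from the sampler signature in models/pipeline.py further down).

import random

import torch

from models import SwittiPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = SwittiPipeline.from_pretrained("yresearch/Switti-1024",
                                      device=device,
                                      torch_dtype=torch.bfloat16)

seed = random.randint(0, 2**31 - 1)  # stays within MAX_SEED = np.iinfo(np.int32).max

# Hypothetical call: argument names come from the sampler signature in models/pipeline.py;
# whether it is exposed as __call__ is an assumption.
images = pipe(
    "a red panda reading a book",
    seed=seed,
    cfg=6.0,
    top_k=400,
    top_p=0.95,
    turn_off_cfg_start_si=11,
)
images[0].save("sample.png")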
models/helpers.py
CHANGED
@@ -3,6 +3,13 @@ from torch import nn as nn
 from torch.nn import functional as F
 
 
+RESOLUTION_PATCH_NUMS_MAPPING = {
+    256: "1_2_3_4_5_6_8_10_13_16",
+    512: "1_2_3_4_6_9_13_18_24_32",
+    1024: "1_2_3_4_5_7_9_12_16_21_27_36_48_64",
+}
+
+
 def sample_with_top_k_top_p_(
     logits_BlV: torch.Tensor,
     top_k: int = 0,
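The new mapping encodes each resolution's scale schedule as an underscore-separated string; SwittiHF and VQVAEHF (below) parse it into a tuple of patch sizes. A small sketch of that parsing, using a hypothetical helper name, also shows where the slider bounds in app.py come from: the 1024 schedule has 14 scales, matching maximum=14.

from models.helpers import RESOLUTION_PATCH_NUMS_MAPPING

def patch_nums_for(reso: int) -> tuple[int, ...]:
    # Same parsing as in SwittiHF / VQVAEHF: split on "_" and cast to int.
    return tuple(int(x) for x in RESOLUTION_PATCH_NUMS_MAPPING[reso].split("_"))

print(patch_nums_for(1024))       # (1, 2, 3, 4, 5, 7, 9, 12, 16, 21, 27, 36, 48, 64)
print(len(patch_nums_for(1024)))  # 14 scales, hence maximum=14 for the CFG-disable slider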
models/pipeline.py
CHANGED
@@ -8,14 +8,16 @@ from models.switti import SwittiHF, get_crop_condition
 from models.helpers import sample_with_top_k_top_p_, gumbel_softmax_with_rng
 
 
+TRAIN_IMAGE_SIZE = (512, 512)
+
 class SwittiPipeline:
     vae_path = "yresearch/VQVAE-Switti"
     text_encoder_path = "openai/clip-vit-large-patch14"
     text_encoder_2_path = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
 
-    def __init__(self, switti, vae, text_encoder, text_encoder_2,
-
-
+    def __init__(self, switti, vae, text_encoder, text_encoder_2,
+                 device, dtype=torch.float32,
+                 ):
         self.switti = switti.to(dtype)
         self.vae = vae.to(dtype)
         self.text_encoder = text_encoder.to(dtype)
@@ -27,13 +29,18 @@ class SwittiPipeline:
         self.device = device
 
     @classmethod
-    def from_pretrained(cls,
-
-
+    def from_pretrained(cls,
+                        pretrained_model_name_or_path,
+                        torch_dtype=torch.bfloat16,
+                        device="cuda",
+                        reso=1024,
+                        ):
+        switti = SwittiHF.from_pretrained(pretrained_model_name_or_path).to(device)
+        vae = VQVAEHF.from_pretrained(cls.vae_path, reso=reso).to(device)
         text_encoder = FrozenCLIPEmbedder(cls.text_encoder_path, device=device)
         text_encoder_2 = FrozenCLIPEmbedder(cls.text_encoder_2_path, device=device)
 
-        return cls(switti, vae, text_encoder, text_encoder_2, device)
+        return cls(switti, vae, text_encoder, text_encoder_2, device, torch_dtype)
 
     @staticmethod
     def to_image(tensor):
@@ -84,7 +91,7 @@ class SwittiPipeline:
         prompt: str | list[str],
         null_prompt: str = "",
         seed: int | None = None,
-        cfg: float =
+        cfg: float = 6.,
         top_k: int = 400,
         top_p: float = 0.95,
         more_smooth: bool = False,
@@ -92,8 +99,7 @@ class SwittiPipeline:
         smooth_start_si: int = 0,
         turn_off_cfg_start_si: int = 10,
         turn_on_cfg_start_si: int = 0,
-
-        last_scale_temp: float = 1.,
+        last_scale_temp: None | float = None,
     ) -> torch.Tensor | list[PILImage]:
         """
         only used for inference, on autoregressive mode
@@ -122,8 +128,8 @@ class SwittiPipeline:
         cond_vector = switti.text_pooler(cond_vector)
 
         if switti.use_crop_cond:
-            crop_coords = get_crop_condition(2 * B * [
-                                             2 * B * [
+            crop_coords = get_crop_condition(2 * B * [TRAIN_IMAGE_SIZE[0]],
+                                             2 * B * [TRAIN_IMAGE_SIZE[1]],
                                              ).to(cond_vector.device)
             crop_embed = switti.crop_embed(crop_coords.view(-1)).reshape(2 * B, switti.D)
             crop_cond = switti.crop_proj(crop_embed)
@@ -169,7 +175,7 @@ class SwittiPipeline:
                 if b.attn.caching and b.attn.cached_k is not None:
                     b.attn.cached_k = b.attn.cached_k[:B]
                     b.attn.cached_v = b.attn.cached_v[:B]
-                if b.cross_attn.caching
+                if b.cross_attn.caching and b.cross_attn.cached_k is not None:
                     b.cross_attn.cached_k = b.cross_attn.cached_k[:B]
                     b.cross_attn.cached_v = b.cross_attn.cached_v[:B]
             else:
@@ -197,7 +203,7 @@ class SwittiPipeline:
                 # default const cfg
                 t = cfg
                 logits_BlV = (1 + t) * logits_BlV[:B] - t * logits_BlV[B:]
-
+            elif last_scale_temp is not None:
                 logits_BlV = logits_BlV / last_scale_temp
 
             if apply_smooth and si >= smooth_start_si:
@@ -208,7 +214,7 @@ class SwittiPipeline:
                 )
                 h_BChw = idx_Bl @ vae_quant.embedding.weight.unsqueeze(0)
             else:
-                #
+                # default nucleus sampling
                 idx_Bl = sample_with_top_k_top_p_(
                     logits_BlV, rng=rng, top_k=top_k, top_p=top_p, num_samples=1,
                 )[:, :, 0]
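For reference, a minimal loading sketch using the defaults introduced here (device="cuda", torch_dtype=torch.bfloat16, reso=1024). The reso argument is forwarded to the VQVAE and should match the checkpoint being loaded; the repo id for a lower-resolution variant is an assumption, since only "yresearch/Switti" appears in the old app.py.

import torch

from models import SwittiPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Defaults added in this commit: torch_dtype=torch.bfloat16, device="cuda", reso=1024.
pipe = SwittiPipeline.from_pretrained(
    "yresearch/Switti-1024",
    device=device,
    torch_dtype=torch.bfloat16,
    reso=1024,  # forwarded to VQVAEHF.from_pretrained(cls.vae_path, reso=reso)
)

# Hypothetical 512 variant (repo id assumed, not shown in this diff):
# pipe_512 = SwittiPipeline.from_pretrained("yresearch/Switti", reso=512)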
models/switti.py
CHANGED
@@ -9,7 +9,7 @@ from diffusers.models.embeddings import GaussianFourierProjection
 
 from models.basic_switti import AdaLNBeforeHead, AdaLNSelfCrossAttn
 from models.rope import compute_axial_cis
-
+from models.helpers import RESOLUTION_PATCH_NUMS_MAPPING
 
 def get_crop_condition(
     heights: list,
@@ -53,7 +53,6 @@ class Switti(nn.Module):
         use_swiglu_ffn=True,
         use_ar=False,
         use_crop_cond=True,
-        device='cuda',
     ):
         super().__init__()
         # 0. hyperparameters
@@ -392,20 +391,20 @@ class SwittiHF(Switti, PyTorchModelHubMixin):
         use_swiglu_ffn=True,
         use_ar=False,
         use_crop_cond=True,
-
+        reso=512,
     ):
         heads = depth
         width = depth * 64
+        patch_nums = tuple([int(x) for x in RESOLUTION_PATCH_NUMS_MAPPING[reso].split("_")])
         super().__init__(
             depth=depth,
             embed_dim=width,
             num_heads=heads,
-            patch_nums=
+            patch_nums=patch_nums,
             rope=rope,
             rope_theta=rope_theta,
             rope_size=rope_size,
             use_swiglu_ffn=use_swiglu_ffn,
             use_ar=use_ar,
             use_crop_cond=use_crop_cond,
-            device=device,
         )
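With device removed from the constructor, placement is now done by the caller (SwittiPipeline.from_pretrained moves the model with .to(device)), and reso selects the patch-num schedule instead of a hard-coded tuple. A construction sketch under those assumptions, with the depth value chosen only for illustration:

from models.switti import SwittiHF
from models.helpers import RESOLUTION_PATCH_NUMS_MAPPING

reso = 512  # new default for SwittiHF
patch_nums = tuple(int(x) for x in RESOLUTION_PATCH_NUMS_MAPPING[reso].split("_"))
assert patch_nums == (1, 2, 3, 4, 6, 9, 13, 18, 24, 32)

# Hypothetical direct construction; depth=16 is an assumed value (heads = depth, width = depth * 64).
import torch
model = SwittiHF(depth=16, reso=reso)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")  # device is applied externally now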
models/vqvae.py
CHANGED
@@ -13,7 +13,7 @@ from huggingface_hub import PyTorchModelHubMixin
 
 from .basic_vae import Decoder, Encoder
 from .quant import VectorQuantizer2
-
+from models.helpers import RESOLUTION_PATCH_NUMS_MAPPING
 
 
 class VQVAE(nn.Module):
@@ -172,8 +172,9 @@ class VQVAEHF(VQVAE, PyTorchModelHubMixin):
         ch=160,
         test_mode=True,
         share_quant_resi=4,
-
+        reso=1024,
     ):
+        v_patch_nums = tuple((int(x) for x in RESOLUTION_PATCH_NUMS_MAPPING[reso].split("_")))
        super().__init__(
             vocab_size=vocab_size,
             z_channels=z_channels,
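The VQVAE derives its multi-scale quantizer schedule from the same mapping, so the tokenizer and the transformer stay consistent for a given resolution. A loading sketch with the new default reso=1024 (standalone use of VQVAEHF outside the pipeline is an assumption):

from models.vqvae import VQVAEHF
from models.helpers import RESOLUTION_PATCH_NUMS_MAPPING

# reso=1024 expands to the 14-scale schedule used by the quantizer.
vae = VQVAEHF.from_pretrained("yresearch/VQVAE-Switti", reso=1024)

expected = tuple(int(x) for x in RESOLUTION_PATCH_NUMS_MAPPING[1024].split("_"))
print(expected)  # (1, 2, 3, 4, 5, 7, 9, 12, 16, 21, 27, 36, 48, 64)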