MaykaGR commited on
Commit
078a1ac
·
verified ·
1 Parent(s): fb0e426

Upload 21 files

Browse files
GraphView-CUSGEqGS.js ADDED
The diff for this file is too large to render. See raw diff
 
causal_conv3d.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import comfy.ops
6
+ ops = comfy.ops.disable_weight_init
7
+
8
+
9
+ class CausalConv3d(nn.Module):
10
+ def __init__(
11
+ self,
12
+ in_channels,
13
+ out_channels,
14
+ kernel_size: int = 3,
15
+ stride: Union[int, Tuple[int]] = 1,
16
+ dilation: int = 1,
17
+ groups: int = 1,
18
+ **kwargs,
19
+ ):
20
+ super().__init__()
21
+
22
+ self.in_channels = in_channels
23
+ self.out_channels = out_channels
24
+
25
+ kernel_size = (kernel_size, kernel_size, kernel_size)
26
+ self.time_kernel_size = kernel_size[0]
27
+
28
+ dilation = (dilation, 1, 1)
29
+
30
+ height_pad = kernel_size[1] // 2
31
+ width_pad = kernel_size[2] // 2
32
+ padding = (0, height_pad, width_pad)
33
+
34
+ self.conv = ops.Conv3d(
35
+ in_channels,
36
+ out_channels,
37
+ kernel_size,
38
+ stride=stride,
39
+ dilation=dilation,
40
+ padding=padding,
41
+ padding_mode="zeros",
42
+ groups=groups,
43
+ )
44
+
45
+ def forward(self, x, causal: bool = True):
46
+ if causal:
47
+ first_frame_pad = x[:, :, :1, :, :].repeat(
48
+ (1, 1, self.time_kernel_size - 1, 1, 1)
49
+ )
50
+ x = torch.concatenate((first_frame_pad, x), dim=2)
51
+ else:
52
+ first_frame_pad = x[:, :, :1, :, :].repeat(
53
+ (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
54
+ )
55
+ last_frame_pad = x[:, :, -1:, :, :].repeat(
56
+ (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
57
+ )
58
+ x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
59
+ x = self.conv(x)
60
+ return x
61
+
62
+ @property
63
+ def weight(self):
64
+ return self.conv.weight
causal_video_autoencoder.py ADDED
@@ -0,0 +1,907 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from functools import partial
4
+ import math
5
+ from einops import rearrange
6
+ from typing import Optional, Tuple, Union
7
+ from .conv_nd_factory import make_conv_nd, make_linear_nd
8
+ from .pixel_norm import PixelNorm
9
+ from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
10
+ import comfy.ops
11
+ ops = comfy.ops.disable_weight_init
12
+
13
+ class Encoder(nn.Module):
14
+ r"""
15
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
16
+
17
+ Args:
18
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
19
+ The number of dimensions to use in convolutions.
20
+ in_channels (`int`, *optional*, defaults to 3):
21
+ The number of input channels.
22
+ out_channels (`int`, *optional*, defaults to 3):
23
+ The number of output channels.
24
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
25
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
26
+ base_channels (`int`, *optional*, defaults to 128):
27
+ The number of output channels for the first convolutional layer.
28
+ norm_num_groups (`int`, *optional*, defaults to 32):
29
+ The number of groups for normalization.
30
+ patch_size (`int`, *optional*, defaults to 1):
31
+ The patch size to use. Should be a power of 2.
32
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
33
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
34
+ latent_log_var (`str`, *optional*, defaults to `per_channel`):
35
+ The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ dims: Union[int, Tuple[int, int]] = 3,
41
+ in_channels: int = 3,
42
+ out_channels: int = 3,
43
+ blocks=[("res_x", 1)],
44
+ base_channels: int = 128,
45
+ norm_num_groups: int = 32,
46
+ patch_size: Union[int, Tuple[int]] = 1,
47
+ norm_layer: str = "group_norm", # group_norm, pixel_norm
48
+ latent_log_var: str = "per_channel",
49
+ ):
50
+ super().__init__()
51
+ self.patch_size = patch_size
52
+ self.norm_layer = norm_layer
53
+ self.latent_channels = out_channels
54
+ self.latent_log_var = latent_log_var
55
+ self.blocks_desc = blocks
56
+
57
+ in_channels = in_channels * patch_size**2
58
+ output_channel = base_channels
59
+
60
+ self.conv_in = make_conv_nd(
61
+ dims=dims,
62
+ in_channels=in_channels,
63
+ out_channels=output_channel,
64
+ kernel_size=3,
65
+ stride=1,
66
+ padding=1,
67
+ causal=True,
68
+ )
69
+
70
+ self.down_blocks = nn.ModuleList([])
71
+
72
+ for block_name, block_params in blocks:
73
+ input_channel = output_channel
74
+ if isinstance(block_params, int):
75
+ block_params = {"num_layers": block_params}
76
+
77
+ if block_name == "res_x":
78
+ block = UNetMidBlock3D(
79
+ dims=dims,
80
+ in_channels=input_channel,
81
+ num_layers=block_params["num_layers"],
82
+ resnet_eps=1e-6,
83
+ resnet_groups=norm_num_groups,
84
+ norm_layer=norm_layer,
85
+ )
86
+ elif block_name == "res_x_y":
87
+ output_channel = block_params.get("multiplier", 2) * output_channel
88
+ block = ResnetBlock3D(
89
+ dims=dims,
90
+ in_channels=input_channel,
91
+ out_channels=output_channel,
92
+ eps=1e-6,
93
+ groups=norm_num_groups,
94
+ norm_layer=norm_layer,
95
+ )
96
+ elif block_name == "compress_time":
97
+ block = make_conv_nd(
98
+ dims=dims,
99
+ in_channels=input_channel,
100
+ out_channels=output_channel,
101
+ kernel_size=3,
102
+ stride=(2, 1, 1),
103
+ causal=True,
104
+ )
105
+ elif block_name == "compress_space":
106
+ block = make_conv_nd(
107
+ dims=dims,
108
+ in_channels=input_channel,
109
+ out_channels=output_channel,
110
+ kernel_size=3,
111
+ stride=(1, 2, 2),
112
+ causal=True,
113
+ )
114
+ elif block_name == "compress_all":
115
+ block = make_conv_nd(
116
+ dims=dims,
117
+ in_channels=input_channel,
118
+ out_channels=output_channel,
119
+ kernel_size=3,
120
+ stride=(2, 2, 2),
121
+ causal=True,
122
+ )
123
+ elif block_name == "compress_all_x_y":
124
+ output_channel = block_params.get("multiplier", 2) * output_channel
125
+ block = make_conv_nd(
126
+ dims=dims,
127
+ in_channels=input_channel,
128
+ out_channels=output_channel,
129
+ kernel_size=3,
130
+ stride=(2, 2, 2),
131
+ causal=True,
132
+ )
133
+ else:
134
+ raise ValueError(f"unknown block: {block_name}")
135
+
136
+ self.down_blocks.append(block)
137
+
138
+ # out
139
+ if norm_layer == "group_norm":
140
+ self.conv_norm_out = nn.GroupNorm(
141
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
142
+ )
143
+ elif norm_layer == "pixel_norm":
144
+ self.conv_norm_out = PixelNorm()
145
+ elif norm_layer == "layer_norm":
146
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
147
+
148
+ self.conv_act = nn.SiLU()
149
+
150
+ conv_out_channels = out_channels
151
+ if latent_log_var == "per_channel":
152
+ conv_out_channels *= 2
153
+ elif latent_log_var == "uniform":
154
+ conv_out_channels += 1
155
+ elif latent_log_var != "none":
156
+ raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
157
+ self.conv_out = make_conv_nd(
158
+ dims, output_channel, conv_out_channels, 3, padding=1, causal=True
159
+ )
160
+
161
+ self.gradient_checkpointing = False
162
+
163
+ def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
164
+ r"""The forward method of the `Encoder` class."""
165
+
166
+ sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
167
+ sample = self.conv_in(sample)
168
+
169
+ checkpoint_fn = (
170
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
171
+ if self.gradient_checkpointing and self.training
172
+ else lambda x: x
173
+ )
174
+
175
+ for down_block in self.down_blocks:
176
+ sample = checkpoint_fn(down_block)(sample)
177
+
178
+ sample = self.conv_norm_out(sample)
179
+ sample = self.conv_act(sample)
180
+ sample = self.conv_out(sample)
181
+
182
+ if self.latent_log_var == "uniform":
183
+ last_channel = sample[:, -1:, ...]
184
+ num_dims = sample.dim()
185
+
186
+ if num_dims == 4:
187
+ # For shape (B, C, H, W)
188
+ repeated_last_channel = last_channel.repeat(
189
+ 1, sample.shape[1] - 2, 1, 1
190
+ )
191
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
192
+ elif num_dims == 5:
193
+ # For shape (B, C, F, H, W)
194
+ repeated_last_channel = last_channel.repeat(
195
+ 1, sample.shape[1] - 2, 1, 1, 1
196
+ )
197
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
198
+ else:
199
+ raise ValueError(f"Invalid input shape: {sample.shape}")
200
+
201
+ return sample
202
+
203
+
204
+ class Decoder(nn.Module):
205
+ r"""
206
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
207
+
208
+ Args:
209
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
210
+ The number of dimensions to use in convolutions.
211
+ in_channels (`int`, *optional*, defaults to 3):
212
+ The number of input channels.
213
+ out_channels (`int`, *optional*, defaults to 3):
214
+ The number of output channels.
215
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
216
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
217
+ base_channels (`int`, *optional*, defaults to 128):
218
+ The number of output channels for the first convolutional layer.
219
+ norm_num_groups (`int`, *optional*, defaults to 32):
220
+ The number of groups for normalization.
221
+ patch_size (`int`, *optional*, defaults to 1):
222
+ The patch size to use. Should be a power of 2.
223
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
224
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
225
+ causal (`bool`, *optional*, defaults to `True`):
226
+ Whether to use causal convolutions or not.
227
+ """
228
+
229
+ def __init__(
230
+ self,
231
+ dims,
232
+ in_channels: int = 3,
233
+ out_channels: int = 3,
234
+ blocks=[("res_x", 1)],
235
+ base_channels: int = 128,
236
+ layers_per_block: int = 2,
237
+ norm_num_groups: int = 32,
238
+ patch_size: int = 1,
239
+ norm_layer: str = "group_norm",
240
+ causal: bool = True,
241
+ timestep_conditioning: bool = False,
242
+ ):
243
+ super().__init__()
244
+ self.patch_size = patch_size
245
+ self.layers_per_block = layers_per_block
246
+ out_channels = out_channels * patch_size**2
247
+ self.causal = causal
248
+ self.blocks_desc = blocks
249
+
250
+ # Compute output channel to be product of all channel-multiplier blocks
251
+ output_channel = base_channels
252
+ for block_name, block_params in list(reversed(blocks)):
253
+ block_params = block_params if isinstance(block_params, dict) else {}
254
+ if block_name == "res_x_y":
255
+ output_channel = output_channel * block_params.get("multiplier", 2)
256
+ if block_name == "compress_all":
257
+ output_channel = output_channel * block_params.get("multiplier", 1)
258
+
259
+ self.conv_in = make_conv_nd(
260
+ dims,
261
+ in_channels,
262
+ output_channel,
263
+ kernel_size=3,
264
+ stride=1,
265
+ padding=1,
266
+ causal=True,
267
+ )
268
+
269
+ self.up_blocks = nn.ModuleList([])
270
+
271
+ for block_name, block_params in list(reversed(blocks)):
272
+ input_channel = output_channel
273
+ if isinstance(block_params, int):
274
+ block_params = {"num_layers": block_params}
275
+
276
+ if block_name == "res_x":
277
+ block = UNetMidBlock3D(
278
+ dims=dims,
279
+ in_channels=input_channel,
280
+ num_layers=block_params["num_layers"],
281
+ resnet_eps=1e-6,
282
+ resnet_groups=norm_num_groups,
283
+ norm_layer=norm_layer,
284
+ inject_noise=block_params.get("inject_noise", False),
285
+ timestep_conditioning=timestep_conditioning,
286
+ )
287
+ elif block_name == "attn_res_x":
288
+ block = UNetMidBlock3D(
289
+ dims=dims,
290
+ in_channels=input_channel,
291
+ num_layers=block_params["num_layers"],
292
+ resnet_groups=norm_num_groups,
293
+ norm_layer=norm_layer,
294
+ inject_noise=block_params.get("inject_noise", False),
295
+ timestep_conditioning=timestep_conditioning,
296
+ attention_head_dim=block_params["attention_head_dim"],
297
+ )
298
+ elif block_name == "res_x_y":
299
+ output_channel = output_channel // block_params.get("multiplier", 2)
300
+ block = ResnetBlock3D(
301
+ dims=dims,
302
+ in_channels=input_channel,
303
+ out_channels=output_channel,
304
+ eps=1e-6,
305
+ groups=norm_num_groups,
306
+ norm_layer=norm_layer,
307
+ inject_noise=block_params.get("inject_noise", False),
308
+ timestep_conditioning=False,
309
+ )
310
+ elif block_name == "compress_time":
311
+ block = DepthToSpaceUpsample(
312
+ dims=dims, in_channels=input_channel, stride=(2, 1, 1)
313
+ )
314
+ elif block_name == "compress_space":
315
+ block = DepthToSpaceUpsample(
316
+ dims=dims, in_channels=input_channel, stride=(1, 2, 2)
317
+ )
318
+ elif block_name == "compress_all":
319
+ output_channel = output_channel // block_params.get("multiplier", 1)
320
+ block = DepthToSpaceUpsample(
321
+ dims=dims,
322
+ in_channels=input_channel,
323
+ stride=(2, 2, 2),
324
+ residual=block_params.get("residual", False),
325
+ out_channels_reduction_factor=block_params.get("multiplier", 1),
326
+ )
327
+ else:
328
+ raise ValueError(f"unknown layer: {block_name}")
329
+
330
+ self.up_blocks.append(block)
331
+
332
+ if norm_layer == "group_norm":
333
+ self.conv_norm_out = nn.GroupNorm(
334
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
335
+ )
336
+ elif norm_layer == "pixel_norm":
337
+ self.conv_norm_out = PixelNorm()
338
+ elif norm_layer == "layer_norm":
339
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
340
+
341
+ self.conv_act = nn.SiLU()
342
+ self.conv_out = make_conv_nd(
343
+ dims, output_channel, out_channels, 3, padding=1, causal=True
344
+ )
345
+
346
+ self.gradient_checkpointing = False
347
+
348
+ self.timestep_conditioning = timestep_conditioning
349
+
350
+ if timestep_conditioning:
351
+ self.timestep_scale_multiplier = nn.Parameter(
352
+ torch.tensor(1000.0, dtype=torch.float32)
353
+ )
354
+ self.last_time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
355
+ output_channel * 2, 0, operations=ops,
356
+ )
357
+ self.last_scale_shift_table = nn.Parameter(torch.empty(2, output_channel))
358
+
359
+ # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
360
+ def forward(
361
+ self,
362
+ sample: torch.FloatTensor,
363
+ timestep: Optional[torch.Tensor] = None,
364
+ ) -> torch.FloatTensor:
365
+ r"""The forward method of the `Decoder` class."""
366
+ batch_size = sample.shape[0]
367
+
368
+ sample = self.conv_in(sample, causal=self.causal)
369
+
370
+ checkpoint_fn = (
371
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
372
+ if self.gradient_checkpointing and self.training
373
+ else lambda x: x
374
+ )
375
+
376
+ scaled_timestep = None
377
+ if self.timestep_conditioning:
378
+ assert (
379
+ timestep is not None
380
+ ), "should pass timestep with timestep_conditioning=True"
381
+ scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device)
382
+
383
+ for up_block in self.up_blocks:
384
+ if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
385
+ sample = checkpoint_fn(up_block)(
386
+ sample, causal=self.causal, timestep=scaled_timestep
387
+ )
388
+ else:
389
+ sample = checkpoint_fn(up_block)(sample, causal=self.causal)
390
+
391
+ sample = self.conv_norm_out(sample)
392
+
393
+ if self.timestep_conditioning:
394
+ embedded_timestep = self.last_time_embedder(
395
+ timestep=scaled_timestep.flatten(),
396
+ resolution=None,
397
+ aspect_ratio=None,
398
+ batch_size=sample.shape[0],
399
+ hidden_dtype=sample.dtype,
400
+ )
401
+ embedded_timestep = embedded_timestep.view(
402
+ batch_size, embedded_timestep.shape[-1], 1, 1, 1
403
+ )
404
+ ada_values = self.last_scale_shift_table[
405
+ None, ..., None, None, None
406
+ ].to(device=sample.device, dtype=sample.dtype) + embedded_timestep.reshape(
407
+ batch_size,
408
+ 2,
409
+ -1,
410
+ embedded_timestep.shape[-3],
411
+ embedded_timestep.shape[-2],
412
+ embedded_timestep.shape[-1],
413
+ )
414
+ shift, scale = ada_values.unbind(dim=1)
415
+ sample = sample * (1 + scale) + shift
416
+
417
+ sample = self.conv_act(sample)
418
+ sample = self.conv_out(sample, causal=self.causal)
419
+
420
+ sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
421
+
422
+ return sample
423
+
424
+
425
+ class UNetMidBlock3D(nn.Module):
426
+ """
427
+ A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
428
+
429
+ Args:
430
+ in_channels (`int`): The number of input channels.
431
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
432
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
433
+ resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
434
+ resnet_groups (`int`, *optional*, defaults to 32):
435
+ The number of groups to use in the group normalization layers of the resnet blocks.
436
+
437
+ Returns:
438
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
439
+ in_channels, height, width)`.
440
+
441
+ """
442
+
443
+ def __init__(
444
+ self,
445
+ dims: Union[int, Tuple[int, int]],
446
+ in_channels: int,
447
+ dropout: float = 0.0,
448
+ num_layers: int = 1,
449
+ resnet_eps: float = 1e-6,
450
+ resnet_groups: int = 32,
451
+ norm_layer: str = "group_norm",
452
+ inject_noise: bool = False,
453
+ timestep_conditioning: bool = False,
454
+ ):
455
+ super().__init__()
456
+ resnet_groups = (
457
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
458
+ )
459
+
460
+ self.timestep_conditioning = timestep_conditioning
461
+
462
+ if timestep_conditioning:
463
+ self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
464
+ in_channels * 4, 0, operations=ops,
465
+ )
466
+
467
+ self.res_blocks = nn.ModuleList(
468
+ [
469
+ ResnetBlock3D(
470
+ dims=dims,
471
+ in_channels=in_channels,
472
+ out_channels=in_channels,
473
+ eps=resnet_eps,
474
+ groups=resnet_groups,
475
+ dropout=dropout,
476
+ norm_layer=norm_layer,
477
+ inject_noise=inject_noise,
478
+ timestep_conditioning=timestep_conditioning,
479
+ )
480
+ for _ in range(num_layers)
481
+ ]
482
+ )
483
+
484
+ def forward(
485
+ self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
486
+ ) -> torch.FloatTensor:
487
+ timestep_embed = None
488
+ if self.timestep_conditioning:
489
+ assert (
490
+ timestep is not None
491
+ ), "should pass timestep with timestep_conditioning=True"
492
+ batch_size = hidden_states.shape[0]
493
+ timestep_embed = self.time_embedder(
494
+ timestep=timestep.flatten(),
495
+ resolution=None,
496
+ aspect_ratio=None,
497
+ batch_size=batch_size,
498
+ hidden_dtype=hidden_states.dtype,
499
+ )
500
+ timestep_embed = timestep_embed.view(
501
+ batch_size, timestep_embed.shape[-1], 1, 1, 1
502
+ )
503
+
504
+ for resnet in self.res_blocks:
505
+ hidden_states = resnet(hidden_states, causal=causal, timestep=timestep_embed)
506
+
507
+ return hidden_states
508
+
509
+
510
+ class DepthToSpaceUpsample(nn.Module):
511
+ def __init__(
512
+ self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
513
+ ):
514
+ super().__init__()
515
+ self.stride = stride
516
+ self.out_channels = (
517
+ math.prod(stride) * in_channels // out_channels_reduction_factor
518
+ )
519
+ self.conv = make_conv_nd(
520
+ dims=dims,
521
+ in_channels=in_channels,
522
+ out_channels=self.out_channels,
523
+ kernel_size=3,
524
+ stride=1,
525
+ causal=True,
526
+ )
527
+ self.residual = residual
528
+ self.out_channels_reduction_factor = out_channels_reduction_factor
529
+
530
+ def forward(self, x, causal: bool = True, timestep: Optional[torch.Tensor] = None):
531
+ if self.residual:
532
+ # Reshape and duplicate the input to match the output shape
533
+ x_in = rearrange(
534
+ x,
535
+ "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
536
+ p1=self.stride[0],
537
+ p2=self.stride[1],
538
+ p3=self.stride[2],
539
+ )
540
+ num_repeat = math.prod(self.stride) // self.out_channels_reduction_factor
541
+ x_in = x_in.repeat(1, num_repeat, 1, 1, 1)
542
+ if self.stride[0] == 2:
543
+ x_in = x_in[:, :, 1:, :, :]
544
+ x = self.conv(x, causal=causal)
545
+ x = rearrange(
546
+ x,
547
+ "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
548
+ p1=self.stride[0],
549
+ p2=self.stride[1],
550
+ p3=self.stride[2],
551
+ )
552
+ if self.stride[0] == 2:
553
+ x = x[:, :, 1:, :, :]
554
+ if self.residual:
555
+ x = x + x_in
556
+ return x
557
+
558
+ class LayerNorm(nn.Module):
559
+ def __init__(self, dim, eps, elementwise_affine=True) -> None:
560
+ super().__init__()
561
+ self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
562
+
563
+ def forward(self, x):
564
+ x = rearrange(x, "b c d h w -> b d h w c")
565
+ x = self.norm(x)
566
+ x = rearrange(x, "b d h w c -> b c d h w")
567
+ return x
568
+
569
+
570
+ class ResnetBlock3D(nn.Module):
571
+ r"""
572
+ A Resnet block.
573
+
574
+ Parameters:
575
+ in_channels (`int`): The number of channels in the input.
576
+ out_channels (`int`, *optional*, default to be `None`):
577
+ The number of output channels for the first conv layer. If None, same as `in_channels`.
578
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
579
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
580
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
581
+ """
582
+
583
+ def __init__(
584
+ self,
585
+ dims: Union[int, Tuple[int, int]],
586
+ in_channels: int,
587
+ out_channels: Optional[int] = None,
588
+ dropout: float = 0.0,
589
+ groups: int = 32,
590
+ eps: float = 1e-6,
591
+ norm_layer: str = "group_norm",
592
+ inject_noise: bool = False,
593
+ timestep_conditioning: bool = False,
594
+ ):
595
+ super().__init__()
596
+ self.in_channels = in_channels
597
+ out_channels = in_channels if out_channels is None else out_channels
598
+ self.out_channels = out_channels
599
+ self.inject_noise = inject_noise
600
+
601
+ if norm_layer == "group_norm":
602
+ self.norm1 = nn.GroupNorm(
603
+ num_groups=groups, num_channels=in_channels, eps=eps, affine=True
604
+ )
605
+ elif norm_layer == "pixel_norm":
606
+ self.norm1 = PixelNorm()
607
+ elif norm_layer == "layer_norm":
608
+ self.norm1 = LayerNorm(in_channels, eps=eps, elementwise_affine=True)
609
+
610
+ self.non_linearity = nn.SiLU()
611
+
612
+ self.conv1 = make_conv_nd(
613
+ dims,
614
+ in_channels,
615
+ out_channels,
616
+ kernel_size=3,
617
+ stride=1,
618
+ padding=1,
619
+ causal=True,
620
+ )
621
+
622
+ if inject_noise:
623
+ self.per_channel_scale1 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
624
+
625
+ if norm_layer == "group_norm":
626
+ self.norm2 = nn.GroupNorm(
627
+ num_groups=groups, num_channels=out_channels, eps=eps, affine=True
628
+ )
629
+ elif norm_layer == "pixel_norm":
630
+ self.norm2 = PixelNorm()
631
+ elif norm_layer == "layer_norm":
632
+ self.norm2 = LayerNorm(out_channels, eps=eps, elementwise_affine=True)
633
+
634
+ self.dropout = torch.nn.Dropout(dropout)
635
+
636
+ self.conv2 = make_conv_nd(
637
+ dims,
638
+ out_channels,
639
+ out_channels,
640
+ kernel_size=3,
641
+ stride=1,
642
+ padding=1,
643
+ causal=True,
644
+ )
645
+
646
+ if inject_noise:
647
+ self.per_channel_scale2 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
648
+
649
+ self.conv_shortcut = (
650
+ make_linear_nd(
651
+ dims=dims, in_channels=in_channels, out_channels=out_channels
652
+ )
653
+ if in_channels != out_channels
654
+ else nn.Identity()
655
+ )
656
+
657
+ self.norm3 = (
658
+ LayerNorm(in_channels, eps=eps, elementwise_affine=True)
659
+ if in_channels != out_channels
660
+ else nn.Identity()
661
+ )
662
+
663
+ self.timestep_conditioning = timestep_conditioning
664
+
665
+ if timestep_conditioning:
666
+ self.scale_shift_table = nn.Parameter(
667
+ torch.randn(4, in_channels) / in_channels**0.5
668
+ )
669
+
670
+ def _feed_spatial_noise(
671
+ self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
672
+ ) -> torch.FloatTensor:
673
+ spatial_shape = hidden_states.shape[-2:]
674
+ device = hidden_states.device
675
+ dtype = hidden_states.dtype
676
+
677
+ # similar to the "explicit noise inputs" method in style-gan
678
+ spatial_noise = torch.randn(spatial_shape, device=device, dtype=dtype)[None]
679
+ scaled_noise = (spatial_noise * per_channel_scale)[None, :, None, ...]
680
+ hidden_states = hidden_states + scaled_noise
681
+
682
+ return hidden_states
683
+
684
+ def forward(
685
+ self,
686
+ input_tensor: torch.FloatTensor,
687
+ causal: bool = True,
688
+ timestep: Optional[torch.Tensor] = None,
689
+ ) -> torch.FloatTensor:
690
+ hidden_states = input_tensor
691
+ batch_size = hidden_states.shape[0]
692
+
693
+ hidden_states = self.norm1(hidden_states)
694
+ if self.timestep_conditioning:
695
+ assert (
696
+ timestep is not None
697
+ ), "should pass timestep with timestep_conditioning=True"
698
+ ada_values = self.scale_shift_table[
699
+ None, ..., None, None, None
700
+ ].to(device=hidden_states.device, dtype=hidden_states.dtype) + timestep.reshape(
701
+ batch_size,
702
+ 4,
703
+ -1,
704
+ timestep.shape[-3],
705
+ timestep.shape[-2],
706
+ timestep.shape[-1],
707
+ )
708
+ shift1, scale1, shift2, scale2 = ada_values.unbind(dim=1)
709
+
710
+ hidden_states = hidden_states * (1 + scale1) + shift1
711
+
712
+ hidden_states = self.non_linearity(hidden_states)
713
+
714
+ hidden_states = self.conv1(hidden_states, causal=causal)
715
+
716
+ if self.inject_noise:
717
+ hidden_states = self._feed_spatial_noise(
718
+ hidden_states, self.per_channel_scale1.to(device=hidden_states.device, dtype=hidden_states.dtype)
719
+ )
720
+
721
+ hidden_states = self.norm2(hidden_states)
722
+
723
+ if self.timestep_conditioning:
724
+ hidden_states = hidden_states * (1 + scale2) + shift2
725
+
726
+ hidden_states = self.non_linearity(hidden_states)
727
+
728
+ hidden_states = self.dropout(hidden_states)
729
+
730
+ hidden_states = self.conv2(hidden_states, causal=causal)
731
+
732
+ if self.inject_noise:
733
+ hidden_states = self._feed_spatial_noise(
734
+ hidden_states, self.per_channel_scale2.to(device=hidden_states.device, dtype=hidden_states.dtype)
735
+ )
736
+
737
+ input_tensor = self.norm3(input_tensor)
738
+
739
+ batch_size = input_tensor.shape[0]
740
+
741
+ input_tensor = self.conv_shortcut(input_tensor)
742
+
743
+ output_tensor = input_tensor + hidden_states
744
+
745
+ return output_tensor
746
+
747
+
748
+ def patchify(x, patch_size_hw, patch_size_t=1):
749
+ if patch_size_hw == 1 and patch_size_t == 1:
750
+ return x
751
+ if x.dim() == 4:
752
+ x = rearrange(
753
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
754
+ )
755
+ elif x.dim() == 5:
756
+ x = rearrange(
757
+ x,
758
+ "b c (f p) (h q) (w r) -> b (c p r q) f h w",
759
+ p=patch_size_t,
760
+ q=patch_size_hw,
761
+ r=patch_size_hw,
762
+ )
763
+ else:
764
+ raise ValueError(f"Invalid input shape: {x.shape}")
765
+
766
+ return x
767
+
768
+
769
+ def unpatchify(x, patch_size_hw, patch_size_t=1):
770
+ if patch_size_hw == 1 and patch_size_t == 1:
771
+ return x
772
+
773
+ if x.dim() == 4:
774
+ x = rearrange(
775
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
776
+ )
777
+ elif x.dim() == 5:
778
+ x = rearrange(
779
+ x,
780
+ "b (c p r q) f h w -> b c (f p) (h q) (w r)",
781
+ p=patch_size_t,
782
+ q=patch_size_hw,
783
+ r=patch_size_hw,
784
+ )
785
+
786
+ return x
787
+
788
+ class processor(nn.Module):
789
+ def __init__(self):
790
+ super().__init__()
791
+ self.register_buffer("std-of-means", torch.empty(128))
792
+ self.register_buffer("mean-of-means", torch.empty(128))
793
+ self.register_buffer("mean-of-stds", torch.empty(128))
794
+ self.register_buffer("mean-of-stds_over_std-of-means", torch.empty(128))
795
+ self.register_buffer("channel", torch.empty(128))
796
+
797
+ def un_normalize(self, x):
798
+ return (x * self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)) + self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)
799
+
800
+ def normalize(self, x):
801
+ return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)
802
+
803
+ class VideoVAE(nn.Module):
804
+ def __init__(self, version=0):
805
+ super().__init__()
806
+
807
+ if version == 0:
808
+ config = {
809
+ "_class_name": "CausalVideoAutoencoder",
810
+ "dims": 3,
811
+ "in_channels": 3,
812
+ "out_channels": 3,
813
+ "latent_channels": 128,
814
+ "blocks": [
815
+ ["res_x", 4],
816
+ ["compress_all", 1],
817
+ ["res_x_y", 1],
818
+ ["res_x", 3],
819
+ ["compress_all", 1],
820
+ ["res_x_y", 1],
821
+ ["res_x", 3],
822
+ ["compress_all", 1],
823
+ ["res_x", 3],
824
+ ["res_x", 4],
825
+ ],
826
+ "scaling_factor": 1.0,
827
+ "norm_layer": "pixel_norm",
828
+ "patch_size": 4,
829
+ "latent_log_var": "uniform",
830
+ "use_quant_conv": False,
831
+ "causal_decoder": False,
832
+ }
833
+ else:
834
+ config = {
835
+ "_class_name": "CausalVideoAutoencoder",
836
+ "dims": 3,
837
+ "in_channels": 3,
838
+ "out_channels": 3,
839
+ "latent_channels": 128,
840
+ "decoder_blocks": [
841
+ ["res_x", {"num_layers": 5, "inject_noise": True}],
842
+ ["compress_all", {"residual": True, "multiplier": 2}],
843
+ ["res_x", {"num_layers": 6, "inject_noise": True}],
844
+ ["compress_all", {"residual": True, "multiplier": 2}],
845
+ ["res_x", {"num_layers": 7, "inject_noise": True}],
846
+ ["compress_all", {"residual": True, "multiplier": 2}],
847
+ ["res_x", {"num_layers": 8, "inject_noise": False}]
848
+ ],
849
+ "encoder_blocks": [
850
+ ["res_x", {"num_layers": 4}],
851
+ ["compress_all", {}],
852
+ ["res_x_y", 1],
853
+ ["res_x", {"num_layers": 3}],
854
+ ["compress_all", {}],
855
+ ["res_x_y", 1],
856
+ ["res_x", {"num_layers": 3}],
857
+ ["compress_all", {}],
858
+ ["res_x", {"num_layers": 3}],
859
+ ["res_x", {"num_layers": 4}]
860
+ ],
861
+ "scaling_factor": 1.0,
862
+ "norm_layer": "pixel_norm",
863
+ "patch_size": 4,
864
+ "latent_log_var": "uniform",
865
+ "use_quant_conv": False,
866
+ "causal_decoder": False,
867
+ "timestep_conditioning": True,
868
+ }
869
+
870
+ double_z = config.get("double_z", True)
871
+ latent_log_var = config.get(
872
+ "latent_log_var", "per_channel" if double_z else "none"
873
+ )
874
+
875
+ self.encoder = Encoder(
876
+ dims=config["dims"],
877
+ in_channels=config.get("in_channels", 3),
878
+ out_channels=config["latent_channels"],
879
+ blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
880
+ patch_size=config.get("patch_size", 1),
881
+ latent_log_var=latent_log_var,
882
+ norm_layer=config.get("norm_layer", "group_norm"),
883
+ )
884
+
885
+ self.decoder = Decoder(
886
+ dims=config["dims"],
887
+ in_channels=config["latent_channels"],
888
+ out_channels=config.get("out_channels", 3),
889
+ blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
890
+ patch_size=config.get("patch_size", 1),
891
+ norm_layer=config.get("norm_layer", "group_norm"),
892
+ causal=config.get("causal_decoder", False),
893
+ timestep_conditioning=config.get("timestep_conditioning", False),
894
+ )
895
+
896
+ self.timestep_conditioning = config.get("timestep_conditioning", False)
897
+ self.per_channel_statistics = processor()
898
+
899
+ def encode(self, x):
900
+ means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
901
+ return self.per_channel_statistics.normalize(means)
902
+
903
+ def decode(self, x, timestep=0.05, noise_scale=0.025):
904
+ if self.timestep_conditioning: #TODO: seed
905
+ x = torch.randn_like(x) * noise_scale + (1.0 - noise_scale) * x
906
+ return self.decoder(self.per_channel_statistics.un_normalize(x), timestep=timestep)
907
+
conv_nd_factory.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Union
2
+
3
+
4
+ from .dual_conv3d import DualConv3d
5
+ from .causal_conv3d import CausalConv3d
6
+ import comfy.ops
7
+ ops = comfy.ops.disable_weight_init
8
+
9
+ def make_conv_nd(
10
+ dims: Union[int, Tuple[int, int]],
11
+ in_channels: int,
12
+ out_channels: int,
13
+ kernel_size: int,
14
+ stride=1,
15
+ padding=0,
16
+ dilation=1,
17
+ groups=1,
18
+ bias=True,
19
+ causal=False,
20
+ ):
21
+ if dims == 2:
22
+ return ops.Conv2d(
23
+ in_channels=in_channels,
24
+ out_channels=out_channels,
25
+ kernel_size=kernel_size,
26
+ stride=stride,
27
+ padding=padding,
28
+ dilation=dilation,
29
+ groups=groups,
30
+ bias=bias,
31
+ )
32
+ elif dims == 3:
33
+ if causal:
34
+ return CausalConv3d(
35
+ in_channels=in_channels,
36
+ out_channels=out_channels,
37
+ kernel_size=kernel_size,
38
+ stride=stride,
39
+ padding=padding,
40
+ dilation=dilation,
41
+ groups=groups,
42
+ bias=bias,
43
+ )
44
+ return ops.Conv3d(
45
+ in_channels=in_channels,
46
+ out_channels=out_channels,
47
+ kernel_size=kernel_size,
48
+ stride=stride,
49
+ padding=padding,
50
+ dilation=dilation,
51
+ groups=groups,
52
+ bias=bias,
53
+ )
54
+ elif dims == (2, 1):
55
+ return DualConv3d(
56
+ in_channels=in_channels,
57
+ out_channels=out_channels,
58
+ kernel_size=kernel_size,
59
+ stride=stride,
60
+ padding=padding,
61
+ bias=bias,
62
+ )
63
+ else:
64
+ raise ValueError(f"unsupported dimensions: {dims}")
65
+
66
+
67
+ def make_linear_nd(
68
+ dims: int,
69
+ in_channels: int,
70
+ out_channels: int,
71
+ bias=True,
72
+ ):
73
+ if dims == 2:
74
+ return ops.Conv2d(
75
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
76
+ )
77
+ elif dims == 3 or dims == (2, 1):
78
+ return ops.Conv3d(
79
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
80
+ )
81
+ else:
82
+ raise ValueError(f"unsupported dimensions: {dims}")
dual_conv3d.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+
9
+
10
+ class DualConv3d(nn.Module):
11
+ def __init__(
12
+ self,
13
+ in_channels,
14
+ out_channels,
15
+ kernel_size,
16
+ stride: Union[int, Tuple[int, int, int]] = 1,
17
+ padding: Union[int, Tuple[int, int, int]] = 0,
18
+ dilation: Union[int, Tuple[int, int, int]] = 1,
19
+ groups=1,
20
+ bias=True,
21
+ ):
22
+ super(DualConv3d, self).__init__()
23
+
24
+ self.in_channels = in_channels
25
+ self.out_channels = out_channels
26
+ # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
27
+ if isinstance(kernel_size, int):
28
+ kernel_size = (kernel_size, kernel_size, kernel_size)
29
+ if kernel_size == (1, 1, 1):
30
+ raise ValueError(
31
+ "kernel_size must be greater than 1. Use make_linear_nd instead."
32
+ )
33
+ if isinstance(stride, int):
34
+ stride = (stride, stride, stride)
35
+ if isinstance(padding, int):
36
+ padding = (padding, padding, padding)
37
+ if isinstance(dilation, int):
38
+ dilation = (dilation, dilation, dilation)
39
+
40
+ # Set parameters for convolutions
41
+ self.groups = groups
42
+ self.bias = bias
43
+
44
+ # Define the size of the channels after the first convolution
45
+ intermediate_channels = (
46
+ out_channels if in_channels < out_channels else in_channels
47
+ )
48
+
49
+ # Define parameters for the first convolution
50
+ self.weight1 = nn.Parameter(
51
+ torch.Tensor(
52
+ intermediate_channels,
53
+ in_channels // groups,
54
+ 1,
55
+ kernel_size[1],
56
+ kernel_size[2],
57
+ )
58
+ )
59
+ self.stride1 = (1, stride[1], stride[2])
60
+ self.padding1 = (0, padding[1], padding[2])
61
+ self.dilation1 = (1, dilation[1], dilation[2])
62
+ if bias:
63
+ self.bias1 = nn.Parameter(torch.Tensor(intermediate_channels))
64
+ else:
65
+ self.register_parameter("bias1", None)
66
+
67
+ # Define parameters for the second convolution
68
+ self.weight2 = nn.Parameter(
69
+ torch.Tensor(
70
+ out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
71
+ )
72
+ )
73
+ self.stride2 = (stride[0], 1, 1)
74
+ self.padding2 = (padding[0], 0, 0)
75
+ self.dilation2 = (dilation[0], 1, 1)
76
+ if bias:
77
+ self.bias2 = nn.Parameter(torch.Tensor(out_channels))
78
+ else:
79
+ self.register_parameter("bias2", None)
80
+
81
+ # Initialize weights and biases
82
+ self.reset_parameters()
83
+
84
+ def reset_parameters(self):
85
+ nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
86
+ nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
87
+ if self.bias:
88
+ fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)
89
+ bound1 = 1 / math.sqrt(fan_in1)
90
+ nn.init.uniform_(self.bias1, -bound1, bound1)
91
+ fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)
92
+ bound2 = 1 / math.sqrt(fan_in2)
93
+ nn.init.uniform_(self.bias2, -bound2, bound2)
94
+
95
+ def forward(self, x, use_conv3d=False, skip_time_conv=False):
96
+ if use_conv3d:
97
+ return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)
98
+ else:
99
+ return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)
100
+
101
+ def forward_with_3d(self, x, skip_time_conv):
102
+ # First convolution
103
+ x = F.conv3d(
104
+ x,
105
+ self.weight1,
106
+ self.bias1,
107
+ self.stride1,
108
+ self.padding1,
109
+ self.dilation1,
110
+ self.groups,
111
+ )
112
+
113
+ if skip_time_conv:
114
+ return x
115
+
116
+ # Second convolution
117
+ x = F.conv3d(
118
+ x,
119
+ self.weight2,
120
+ self.bias2,
121
+ self.stride2,
122
+ self.padding2,
123
+ self.dilation2,
124
+ self.groups,
125
+ )
126
+
127
+ return x
128
+
129
+ def forward_with_2d(self, x, skip_time_conv):
130
+ b, c, d, h, w = x.shape
131
+
132
+ # First 2D convolution
133
+ x = rearrange(x, "b c d h w -> (b d) c h w")
134
+ # Squeeze the depth dimension out of weight1 since it's 1
135
+ weight1 = self.weight1.squeeze(2)
136
+ # Select stride, padding, and dilation for the 2D convolution
137
+ stride1 = (self.stride1[1], self.stride1[2])
138
+ padding1 = (self.padding1[1], self.padding1[2])
139
+ dilation1 = (self.dilation1[1], self.dilation1[2])
140
+ x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
141
+
142
+ _, _, h, w = x.shape
143
+
144
+ if skip_time_conv:
145
+ x = rearrange(x, "(b d) c h w -> b c d h w", b=b)
146
+ return x
147
+
148
+ # Second convolution which is essentially treated as a 1D convolution across the 'd' dimension
149
+ x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)
150
+
151
+ # Reshape weight2 to match the expected dimensions for conv1d
152
+ weight2 = self.weight2.squeeze(-1).squeeze(-1)
153
+ # Use only the relevant dimension for stride, padding, and dilation for the 1D convolution
154
+ stride2 = self.stride2[0]
155
+ padding2 = self.padding2[0]
156
+ dilation2 = self.dilation2[0]
157
+ x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
158
+ x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
159
+
160
+ return x
161
+
162
+ @property
163
+ def weight(self):
164
+ return self.weight2
165
+
166
+
167
+ def test_dual_conv3d_consistency():
168
+ # Initialize parameters
169
+ in_channels = 3
170
+ out_channels = 5
171
+ kernel_size = (3, 3, 3)
172
+ stride = (2, 2, 2)
173
+ padding = (1, 1, 1)
174
+
175
+ # Create an instance of the DualConv3d class
176
+ dual_conv3d = DualConv3d(
177
+ in_channels=in_channels,
178
+ out_channels=out_channels,
179
+ kernel_size=kernel_size,
180
+ stride=stride,
181
+ padding=padding,
182
+ bias=True,
183
+ )
184
+
185
+ # Example input tensor
186
+ test_input = torch.randn(1, 3, 10, 10, 10)
187
+
188
+ # Perform forward passes with both 3D and 2D settings
189
+ output_conv3d = dual_conv3d(test_input, use_conv3d=True)
190
+ output_2d = dual_conv3d(test_input, use_conv3d=False)
191
+
192
+ # Assert that the outputs from both methods are sufficiently close
193
+ assert torch.allclose(
194
+ output_conv3d, output_2d, atol=1e-6
195
+ ), "Outputs are not consistent between 3D and 2D convolutions."
index-4Hb32CNk.js ADDED
The diff for this file is too large to render. See raw diff
 
index-C1Hb_Yo9.css ADDED
@@ -0,0 +1,5129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* this CSS contains only the basic CSS needed to run the app and use it */
2
+
3
+ .lgraphcanvas {
4
+ /*cursor: crosshair;*/
5
+ user-select: none;
6
+ -moz-user-select: none;
7
+ -webkit-user-select: none;
8
+ outline: none;
9
+ font-family: Tahoma, sans-serif;
10
+ }
11
+
12
+ .lgraphcanvas * {
13
+ box-sizing: border-box;
14
+ }
15
+
16
+ .litegraph.litecontextmenu {
17
+ font-family: Tahoma, sans-serif;
18
+ position: fixed;
19
+ top: 100px;
20
+ left: 100px;
21
+ min-width: 100px;
22
+ color: #aaf;
23
+ padding: 0;
24
+ box-shadow: 0 0 10px black !important;
25
+ background-color: #2e2e2e !important;
26
+ z-index: 10;
27
+ max-height: -webkit-fill-available;
28
+ overflow-y: auto;
29
+ }
30
+
31
+ /* Enable scrolling overflow in Firefox */
32
+ @supports not (max-height: -webkit-fill-available) {
33
+ .litegraph.litecontextmenu {
34
+ max-height: 80vh;
35
+ overflow-y: scroll;
36
+ }
37
+ }
38
+
39
+ .litegraph.litecontextmenu.dark {
40
+ background-color: #000 !important;
41
+ }
42
+
43
+ .litegraph.litecontextmenu .litemenu-title img {
44
+ margin-top: 2px;
45
+ margin-left: 2px;
46
+ margin-right: 4px;
47
+ }
48
+
49
+ .litegraph.litecontextmenu .litemenu-entry {
50
+ margin: 2px;
51
+ padding: 2px;
52
+ }
53
+
54
+ .litegraph.litecontextmenu .litemenu-entry.submenu {
55
+ background-color: #2e2e2e !important;
56
+ }
57
+
58
+ .litegraph.litecontextmenu.dark .litemenu-entry.submenu {
59
+ background-color: #000 !important;
60
+ }
61
+
62
+ .litegraph .litemenubar ul {
63
+ font-family: Tahoma, sans-serif;
64
+ margin: 0;
65
+ padding: 0;
66
+ }
67
+
68
+ .litegraph .litemenubar li {
69
+ font-size: 14px;
70
+ color: #999;
71
+ display: inline-block;
72
+ min-width: 50px;
73
+ padding-left: 10px;
74
+ padding-right: 10px;
75
+ user-select: none;
76
+ -moz-user-select: none;
77
+ -webkit-user-select: none;
78
+ cursor: pointer;
79
+ }
80
+
81
+ .litegraph .litemenubar li:hover {
82
+ background-color: #777;
83
+ color: #eee;
84
+ }
85
+
86
+ .litegraph .litegraph .litemenubar-panel {
87
+ position: absolute;
88
+ top: 5px;
89
+ left: 5px;
90
+ min-width: 100px;
91
+ background-color: #444;
92
+ box-shadow: 0 0 3px black;
93
+ padding: 4px;
94
+ border-bottom: 2px solid #aaf;
95
+ z-index: 10;
96
+ }
97
+
98
+ .litegraph .litemenu-entry,
99
+ .litemenu-title {
100
+ font-size: 12px;
101
+ color: #aaa;
102
+ padding: 0 0 0 4px;
103
+ margin: 2px;
104
+ padding-left: 2px;
105
+ -moz-user-select: none;
106
+ -webkit-user-select: none;
107
+ user-select: none;
108
+ cursor: pointer;
109
+ }
110
+
111
+ .litegraph .litemenu-entry .icon {
112
+ display: inline-block;
113
+ width: 12px;
114
+ height: 12px;
115
+ margin: 2px;
116
+ vertical-align: top;
117
+ }
118
+
119
+ .litegraph .litemenu-entry.checked .icon {
120
+ background-color: #aaf;
121
+ }
122
+
123
+ .litegraph .litemenu-entry .more {
124
+ float: right;
125
+ padding-right: 5px;
126
+ }
127
+
128
+ .litegraph .litemenu-entry.disabled {
129
+ opacity: 0.5;
130
+ cursor: default;
131
+ }
132
+
133
+ .litegraph .litemenu-entry.separator {
134
+ display: block;
135
+ border-top: 1px solid #333;
136
+ border-bottom: 1px solid #666;
137
+ width: 100%;
138
+ height: 0px;
139
+ margin: 3px 0 2px 0;
140
+ background-color: transparent;
141
+ padding: 0 !important;
142
+ cursor: default !important;
143
+ }
144
+
145
+ .litegraph .litemenu-entry.has_submenu {
146
+ border-right: 2px solid cyan;
147
+ }
148
+
149
+ .litegraph .litemenu-title {
150
+ color: #dde;
151
+ background-color: #111;
152
+ margin: 0;
153
+ padding: 2px;
154
+ cursor: default;
155
+ }
156
+
157
+ .litegraph .litemenu-entry:hover:not(.disabled):not(.separator) {
158
+ background-color: #444 !important;
159
+ color: #eee;
160
+ transition: all 0.2s;
161
+ }
162
+
163
+ .litegraph .litemenu-entry .property_name {
164
+ display: inline-block;
165
+ text-align: left;
166
+ min-width: 80px;
167
+ min-height: 1.2em;
168
+ }
169
+
170
+ .litegraph .litemenu-entry .property_value {
171
+ display: inline-block;
172
+ background-color: rgba(0, 0, 0, 0.5);
173
+ text-align: right;
174
+ min-width: 80px;
175
+ min-height: 1.2em;
176
+ vertical-align: middle;
177
+ padding-right: 10px;
178
+ }
179
+
180
+ .litegraph.litesearchbox {
181
+ font-family: Tahoma, sans-serif;
182
+ position: absolute;
183
+ background-color: rgba(0, 0, 0, 0.5);
184
+ padding-top: 4px;
185
+ }
186
+
187
+ .litegraph.litesearchbox input,
188
+ .litegraph.litesearchbox select {
189
+ margin-top: 3px;
190
+ min-width: 60px;
191
+ min-height: 1.5em;
192
+ background-color: black;
193
+ border: 0;
194
+ color: white;
195
+ padding-left: 10px;
196
+ margin-right: 5px;
197
+ max-width: 300px;
198
+ }
199
+
200
+ .litegraph.litesearchbox .name {
201
+ display: inline-block;
202
+ min-width: 60px;
203
+ min-height: 1.5em;
204
+ padding-left: 10px;
205
+ }
206
+
207
+ .litegraph.litesearchbox .helper {
208
+ overflow: auto;
209
+ max-height: 200px;
210
+ margin-top: 2px;
211
+ }
212
+
213
+ .litegraph.lite-search-item {
214
+ font-family: Tahoma, sans-serif;
215
+ background-color: rgba(0, 0, 0, 0.5);
216
+ color: white;
217
+ padding-top: 2px;
218
+ }
219
+
220
+ .litegraph.lite-search-item.not_in_filter {
221
+ /*background-color: rgba(50, 50, 50, 0.5);*/
222
+ /*color: #999;*/
223
+ color: #b99;
224
+ font-style: italic;
225
+ }
226
+
227
+ .litegraph.lite-search-item.generic_type {
228
+ /*background-color: rgba(50, 50, 50, 0.5);*/
229
+ /*color: #DD9;*/
230
+ color: #999;
231
+ font-style: italic;
232
+ }
233
+
234
+ .litegraph.lite-search-item:hover,
235
+ .litegraph.lite-search-item.selected {
236
+ cursor: pointer;
237
+ background-color: white;
238
+ color: black;
239
+ }
240
+
241
+ .litegraph.lite-search-item-type {
242
+ display: inline-block;
243
+ background: rgba(0, 0, 0, 0.2);
244
+ margin-left: 5px;
245
+ font-size: 14px;
246
+ padding: 2px 5px;
247
+ position: relative;
248
+ top: -2px;
249
+ opacity: 0.8;
250
+ border-radius: 4px;
251
+ }
252
+
253
+ /* DIALOGs ******/
254
+
255
+ .litegraph .dialog {
256
+ position: absolute;
257
+ top: 50%;
258
+ left: 50%;
259
+ margin-top: -150px;
260
+ margin-left: -200px;
261
+
262
+ background-color: #2a2a2a;
263
+
264
+ min-width: 400px;
265
+ min-height: 200px;
266
+ box-shadow: 0 0 4px #111;
267
+ border-radius: 6px;
268
+ }
269
+
270
+ .litegraph .dialog.settings {
271
+ left: 10px;
272
+ top: 10px;
273
+ height: calc(100% - 20px);
274
+ margin: auto;
275
+ max-width: 50%;
276
+ }
277
+
278
+ .litegraph .dialog.centered {
279
+ top: 50px;
280
+ left: 50%;
281
+ position: absolute;
282
+ transform: translateX(-50%);
283
+ min-width: 600px;
284
+ min-height: 300px;
285
+ height: calc(100% - 100px);
286
+ margin: auto;
287
+ }
288
+
289
+ .litegraph .dialog .close {
290
+ float: right;
291
+ margin: 4px;
292
+ margin-right: 10px;
293
+ cursor: pointer;
294
+ font-size: 1.4em;
295
+ }
296
+
297
+ .litegraph .dialog .close:hover {
298
+ color: white;
299
+ }
300
+
301
+ .litegraph .dialog .dialog-header {
302
+ color: #aaa;
303
+ border-bottom: 1px solid #161616;
304
+ height: 40px;
305
+ }
306
+ .litegraph .dialog .dialog-footer {
307
+ height: 50px;
308
+ padding: 10px;
309
+ border-top: 1px solid #1a1a1a;
310
+ }
311
+
312
+ .litegraph .dialog .dialog-header .dialog-title {
313
+ font: 20px "Arial";
314
+ margin: 4px;
315
+ padding: 4px 10px;
316
+ display: inline-block;
317
+ }
318
+
319
+ .litegraph .dialog .dialog-content,
320
+ .litegraph .dialog .dialog-alt-content {
321
+ height: calc(100% - 90px);
322
+ width: 100%;
323
+ min-height: 100px;
324
+ display: inline-block;
325
+ color: #aaa;
326
+ /*background-color: black;*/
327
+ overflow: auto;
328
+ }
329
+
330
+ .litegraph .dialog .dialog-content h3 {
331
+ margin: 10px;
332
+ }
333
+
334
+ .litegraph .dialog .dialog-content .connections {
335
+ flex-direction: row;
336
+ }
337
+
338
+ .litegraph .dialog .dialog-content .connections .connections_side {
339
+ width: calc(50% - 5px);
340
+ min-height: 100px;
341
+ background-color: black;
342
+ display: flex;
343
+ }
344
+
345
+ .litegraph .dialog .node_type {
346
+ font-size: 1.2em;
347
+ display: block;
348
+ margin: 10px;
349
+ }
350
+
351
+ .litegraph .dialog .node_desc {
352
+ opacity: 0.5;
353
+ display: block;
354
+ margin: 10px;
355
+ }
356
+
357
+ .litegraph .dialog .separator {
358
+ display: block;
359
+ width: calc(100% - 4px);
360
+ height: 1px;
361
+ border-top: 1px solid #000;
362
+ border-bottom: 1px solid #333;
363
+ margin: 10px 2px;
364
+ padding: 0;
365
+ }
366
+
367
+ .litegraph .dialog .property {
368
+ margin-bottom: 2px;
369
+ padding: 4px;
370
+ }
371
+
372
+ .litegraph .dialog .property:hover {
373
+ background: #545454;
374
+ }
375
+
376
+ .litegraph .dialog .property_name {
377
+ color: #737373;
378
+ display: inline-block;
379
+ text-align: left;
380
+ vertical-align: top;
381
+ width: 160px;
382
+ padding-left: 4px;
383
+ overflow: hidden;
384
+ margin-right: 6px;
385
+ }
386
+
387
+ .litegraph .dialog .property:hover .property_name {
388
+ color: white;
389
+ }
390
+
391
+ .litegraph .dialog .property_value {
392
+ display: inline-block;
393
+ text-align: right;
394
+ color: #aaa;
395
+ background-color: #1a1a1a;
396
+ /*width: calc( 100% - 122px );*/
397
+ max-width: calc(100% - 162px);
398
+ min-width: 200px;
399
+ max-height: 300px;
400
+ min-height: 20px;
401
+ padding: 4px;
402
+ padding-right: 12px;
403
+ overflow: hidden;
404
+ cursor: pointer;
405
+ border-radius: 3px;
406
+ }
407
+
408
+ .litegraph .dialog .property_value:hover {
409
+ color: white;
410
+ }
411
+
412
+ .litegraph .dialog .property.boolean .property_value {
413
+ padding-right: 30px;
414
+ color: #a88;
415
+ /*width: auto;
416
+ float: right;*/
417
+ }
418
+
419
+ .litegraph .dialog .property.boolean.bool-on .property_name {
420
+ color: #8a8;
421
+ }
422
+ .litegraph .dialog .property.boolean.bool-on .property_value {
423
+ color: #8a8;
424
+ }
425
+
426
+ .litegraph .dialog .btn {
427
+ border: 0;
428
+ border-radius: 4px;
429
+ padding: 4px 20px;
430
+ margin-left: 0px;
431
+ background-color: #060606;
432
+ color: #8e8e8e;
433
+ }
434
+
435
+ .litegraph .dialog .btn:hover {
436
+ background-color: #111;
437
+ color: #fff;
438
+ }
439
+
440
+ .litegraph .dialog .btn.delete:hover {
441
+ background-color: #f33;
442
+ color: black;
443
+ }
444
+
445
+ .litegraph .subgraph_property {
446
+ padding: 4px;
447
+ }
448
+
449
+ .litegraph .subgraph_property:hover {
450
+ background-color: #333;
451
+ }
452
+
453
+ .litegraph .subgraph_property.extra {
454
+ margin-top: 8px;
455
+ }
456
+
457
+ .litegraph .subgraph_property span.name {
458
+ font-size: 1.3em;
459
+ padding-left: 4px;
460
+ }
461
+
462
+ .litegraph .subgraph_property span.type {
463
+ opacity: 0.5;
464
+ margin-right: 20px;
465
+ padding-left: 4px;
466
+ }
467
+
468
+ .litegraph .subgraph_property span.label {
469
+ display: inline-block;
470
+ width: 60px;
471
+ padding: 0px 10px;
472
+ }
473
+
474
+ .litegraph .subgraph_property input {
475
+ width: 140px;
476
+ color: #999;
477
+ background-color: #1a1a1a;
478
+ border-radius: 4px;
479
+ border: 0;
480
+ margin-right: 10px;
481
+ padding: 4px;
482
+ padding-left: 10px;
483
+ }
484
+
485
+ .litegraph .subgraph_property button {
486
+ background-color: #1c1c1c;
487
+ color: #aaa;
488
+ border: 0;
489
+ border-radius: 2px;
490
+ padding: 4px 10px;
491
+ cursor: pointer;
492
+ }
493
+
494
+ .litegraph .subgraph_property.extra {
495
+ color: #ccc;
496
+ }
497
+
498
+ .litegraph .subgraph_property.extra input {
499
+ background-color: #111;
500
+ }
501
+
502
+ .litegraph .bullet_icon {
503
+ margin-left: 10px;
504
+ border-radius: 10px;
505
+ width: 12px;
506
+ height: 12px;
507
+ background-color: #666;
508
+ display: inline-block;
509
+ margin-top: 2px;
510
+ margin-right: 4px;
511
+ transition: background-color 0.1s ease 0s;
512
+ -moz-transition: background-color 0.1s ease 0s;
513
+ }
514
+
515
+ .litegraph .bullet_icon:hover {
516
+ background-color: #698;
517
+ cursor: pointer;
518
+ }
519
+
520
+ /* OLD */
521
+
522
+ .graphcontextmenu {
523
+ padding: 4px;
524
+ min-width: 100px;
525
+ }
526
+
527
+ .graphcontextmenu-title {
528
+ color: #dde;
529
+ background-color: #222;
530
+ margin: 0;
531
+ padding: 2px;
532
+ cursor: default;
533
+ }
534
+
535
+ .graphmenu-entry {
536
+ box-sizing: border-box;
537
+ margin: 2px;
538
+ padding-left: 20px;
539
+ user-select: none;
540
+ -moz-user-select: none;
541
+ -webkit-user-select: none;
542
+ transition: all linear 0.3s;
543
+ }
544
+
545
+ .graphmenu-entry.event,
546
+ .litemenu-entry.event {
547
+ border-left: 8px solid orange;
548
+ padding-left: 12px;
549
+ }
550
+
551
+ .graphmenu-entry.disabled {
552
+ opacity: 0.3;
553
+ }
554
+
555
+ .graphmenu-entry.submenu {
556
+ border-right: 2px solid #eee;
557
+ }
558
+
559
+ .graphmenu-entry:hover {
560
+ background-color: #555;
561
+ }
562
+
563
+ .graphmenu-entry.separator {
564
+ background-color: #111;
565
+ border-bottom: 1px solid #666;
566
+ height: 1px;
567
+ width: calc(100% - 20px);
568
+ -moz-width: calc(100% - 20px);
569
+ -webkit-width: calc(100% - 20px);
570
+ }
571
+
572
+ .graphmenu-entry .property_name {
573
+ display: inline-block;
574
+ text-align: left;
575
+ min-width: 80px;
576
+ min-height: 1.2em;
577
+ }
578
+
579
+ .graphmenu-entry .property_value,
580
+ .litemenu-entry .property_value {
581
+ display: inline-block;
582
+ background-color: rgba(0, 0, 0, 0.5);
583
+ text-align: right;
584
+ min-width: 80px;
585
+ min-height: 1.2em;
586
+ vertical-align: middle;
587
+ padding-right: 10px;
588
+ }
589
+
590
+ .graphdialog {
591
+ position: absolute;
592
+ top: 10px;
593
+ left: 10px;
594
+ min-height: 2em;
595
+ background-color: #333;
596
+ font-size: 1.2em;
597
+ box-shadow: 0 0 10px black !important;
598
+ z-index: 10;
599
+ }
600
+
601
+ .graphdialog.rounded {
602
+ border-radius: 12px;
603
+ padding-right: 2px;
604
+ }
605
+
606
+ .graphdialog .name {
607
+ display: inline-block;
608
+ min-width: 60px;
609
+ min-height: 1.5em;
610
+ padding-left: 10px;
611
+ }
612
+
613
+ .graphdialog input,
614
+ .graphdialog textarea,
615
+ .graphdialog select {
616
+ margin: 3px;
617
+ min-width: 60px;
618
+ min-height: 1.5em;
619
+ background-color: black;
620
+ border: 0;
621
+ color: white;
622
+ padding-left: 10px;
623
+ outline: none;
624
+ }
625
+
626
+ .graphdialog textarea {
627
+ min-height: 150px;
628
+ }
629
+
630
+ .graphdialog button {
631
+ margin-top: 3px;
632
+ vertical-align: top;
633
+ background-color: #999;
634
+ border: 0;
635
+ }
636
+
637
+ .graphdialog button.rounded,
638
+ .graphdialog input.rounded {
639
+ border-radius: 0 12px 12px 0;
640
+ }
641
+
642
+ .graphdialog .helper {
643
+ overflow: auto;
644
+ max-height: 200px;
645
+ }
646
+
647
+ .graphdialog .help-item {
648
+ padding-left: 10px;
649
+ }
650
+
651
+ .graphdialog .help-item:hover,
652
+ .graphdialog .help-item.selected {
653
+ cursor: pointer;
654
+ background-color: white;
655
+ color: black;
656
+ }
657
+
658
+ .litegraph .dialog {
659
+ min-height: 0;
660
+ }
661
+ .litegraph .dialog .dialog-content {
662
+ display: block;
663
+ }
664
+ .litegraph .dialog .dialog-content .subgraph_property {
665
+ padding: 5px;
666
+ }
667
+ .litegraph .dialog .dialog-footer {
668
+ margin: 0;
669
+ }
670
+ .litegraph .dialog .dialog-footer .subgraph_property {
671
+ margin-top: 0;
672
+ display: flex;
673
+ align-items: center;
674
+ padding: 5px;
675
+ }
676
+ .litegraph .dialog .dialog-footer .subgraph_property .name {
677
+ flex: 1;
678
+ }
679
+ .litegraph .graphdialog {
680
+ display: flex;
681
+ align-items: center;
682
+ border-radius: 20px;
683
+ padding: 4px 10px;
684
+ position: fixed;
685
+ }
686
+ .litegraph .graphdialog .name {
687
+ padding: 0;
688
+ min-height: 0;
689
+ font-size: 16px;
690
+ vertical-align: middle;
691
+ }
692
+ .litegraph .graphdialog .value {
693
+ font-size: 16px;
694
+ min-height: 0;
695
+ margin: 0 10px;
696
+ padding: 2px 5px;
697
+ }
698
+ .litegraph .graphdialog input[type="checkbox"] {
699
+ width: 16px;
700
+ height: 16px;
701
+ }
702
+ .litegraph .graphdialog button {
703
+ padding: 4px 18px;
704
+ border-radius: 20px;
705
+ cursor: pointer;
706
+ }
707
+ @font-face {
708
+ font-family: 'primeicons';
709
+ font-display: block;
710
+ src: url('./primeicons-DMOk5skT.eot');
711
+ src: url('./primeicons-DMOk5skT.eot?#iefix') format('embedded-opentype'), url('./primeicons-C6QP2o4f.woff2') format('woff2'), url('./primeicons-WjwUDZjB.woff') format('woff'), url('./primeicons-MpK4pl85.ttf') format('truetype'), url('./primeicons-Dr5RGzOO.svg?#primeicons') format('svg');
712
+ font-weight: normal;
713
+ font-style: normal;
714
+ }
715
+
716
+ .pi {
717
+ font-family: 'primeicons';
718
+ speak: none;
719
+ font-style: normal;
720
+ font-weight: normal;
721
+ font-variant: normal;
722
+ text-transform: none;
723
+ line-height: 1;
724
+ display: inline-block;
725
+ -webkit-font-smoothing: antialiased;
726
+ -moz-osx-font-smoothing: grayscale;
727
+ }
728
+
729
+ .pi:before {
730
+ -webkit-backface-visibility: hidden;
731
+ backface-visibility: hidden;
732
+ }
733
+
734
+ .pi-fw {
735
+ width: 1.28571429em;
736
+ text-align: center;
737
+ }
738
+
739
+ .pi-spin {
740
+ animation: fa-spin 2s infinite linear;
741
+ }
742
+
743
+ @media (prefers-reduced-motion: reduce) {
744
+ .pi-spin {
745
+ animation-delay: -1ms;
746
+ animation-duration: 1ms;
747
+ animation-iteration-count: 1;
748
+ transition-delay: 0s;
749
+ transition-duration: 0s;
750
+ }
751
+ }
752
+
753
+ @keyframes fa-spin {
754
+ 0% {
755
+ transform: rotate(0deg);
756
+ }
757
+ 100% {
758
+ transform: rotate(359deg);
759
+ }
760
+ }
761
+
762
+ .pi-folder-plus:before {
763
+ content: "\ea05";
764
+ }
765
+
766
+ .pi-receipt:before {
767
+ content: "\ea06";
768
+ }
769
+
770
+ .pi-asterisk:before {
771
+ content: "\ea07";
772
+ }
773
+
774
+ .pi-face-smile:before {
775
+ content: "\ea08";
776
+ }
777
+
778
+ .pi-pinterest:before {
779
+ content: "\ea09";
780
+ }
781
+
782
+ .pi-expand:before {
783
+ content: "\ea0a";
784
+ }
785
+
786
+ .pi-pen-to-square:before {
787
+ content: "\ea0b";
788
+ }
789
+
790
+ .pi-wave-pulse:before {
791
+ content: "\ea0c";
792
+ }
793
+
794
+ .pi-turkish-lira:before {
795
+ content: "\ea0d";
796
+ }
797
+
798
+ .pi-spinner-dotted:before {
799
+ content: "\ea0e";
800
+ }
801
+
802
+ .pi-crown:before {
803
+ content: "\ea0f";
804
+ }
805
+
806
+ .pi-pause-circle:before {
807
+ content: "\ea10";
808
+ }
809
+
810
+ .pi-warehouse:before {
811
+ content: "\ea11";
812
+ }
813
+
814
+ .pi-objects-column:before {
815
+ content: "\ea12";
816
+ }
817
+
818
+ .pi-clipboard:before {
819
+ content: "\ea13";
820
+ }
821
+
822
+ .pi-play-circle:before {
823
+ content: "\ea14";
824
+ }
825
+
826
+ .pi-venus:before {
827
+ content: "\ea15";
828
+ }
829
+
830
+ .pi-cart-minus:before {
831
+ content: "\ea16";
832
+ }
833
+
834
+ .pi-file-plus:before {
835
+ content: "\ea17";
836
+ }
837
+
838
+ .pi-microchip:before {
839
+ content: "\ea18";
840
+ }
841
+
842
+ .pi-twitch:before {
843
+ content: "\ea19";
844
+ }
845
+
846
+ .pi-building-columns:before {
847
+ content: "\ea1a";
848
+ }
849
+
850
+ .pi-file-check:before {
851
+ content: "\ea1b";
852
+ }
853
+
854
+ .pi-microchip-ai:before {
855
+ content: "\ea1c";
856
+ }
857
+
858
+ .pi-trophy:before {
859
+ content: "\ea1d";
860
+ }
861
+
862
+ .pi-barcode:before {
863
+ content: "\ea1e";
864
+ }
865
+
866
+ .pi-file-arrow-up:before {
867
+ content: "\ea1f";
868
+ }
869
+
870
+ .pi-mars:before {
871
+ content: "\ea20";
872
+ }
873
+
874
+ .pi-tiktok:before {
875
+ content: "\ea21";
876
+ }
877
+
878
+ .pi-arrow-up-right-and-arrow-down-left-from-center:before {
879
+ content: "\ea22";
880
+ }
881
+
882
+ .pi-ethereum:before {
883
+ content: "\ea23";
884
+ }
885
+
886
+ .pi-list-check:before {
887
+ content: "\ea24";
888
+ }
889
+
890
+ .pi-thumbtack:before {
891
+ content: "\ea25";
892
+ }
893
+
894
+ .pi-arrow-down-left-and-arrow-up-right-to-center:before {
895
+ content: "\ea26";
896
+ }
897
+
898
+ .pi-equals:before {
899
+ content: "\ea27";
900
+ }
901
+
902
+ .pi-lightbulb:before {
903
+ content: "\ea28";
904
+ }
905
+
906
+ .pi-star-half:before {
907
+ content: "\ea29";
908
+ }
909
+
910
+ .pi-address-book:before {
911
+ content: "\ea2a";
912
+ }
913
+
914
+ .pi-chart-scatter:before {
915
+ content: "\ea2b";
916
+ }
917
+
918
+ .pi-indian-rupee:before {
919
+ content: "\ea2c";
920
+ }
921
+
922
+ .pi-star-half-fill:before {
923
+ content: "\ea2d";
924
+ }
925
+
926
+ .pi-cart-arrow-down:before {
927
+ content: "\ea2e";
928
+ }
929
+
930
+ .pi-calendar-clock:before {
931
+ content: "\ea2f";
932
+ }
933
+
934
+ .pi-sort-up-fill:before {
935
+ content: "\ea30";
936
+ }
937
+
938
+ .pi-sparkles:before {
939
+ content: "\ea31";
940
+ }
941
+
942
+ .pi-bullseye:before {
943
+ content: "\ea32";
944
+ }
945
+
946
+ .pi-sort-down-fill:before {
947
+ content: "\ea33";
948
+ }
949
+
950
+ .pi-graduation-cap:before {
951
+ content: "\ea34";
952
+ }
953
+
954
+ .pi-hammer:before {
955
+ content: "\ea35";
956
+ }
957
+
958
+ .pi-bell-slash:before {
959
+ content: "\ea36";
960
+ }
961
+
962
+ .pi-gauge:before {
963
+ content: "\ea37";
964
+ }
965
+
966
+ .pi-shop:before {
967
+ content: "\ea38";
968
+ }
969
+
970
+ .pi-headphones:before {
971
+ content: "\ea39";
972
+ }
973
+
974
+ .pi-eraser:before {
975
+ content: "\ea04";
976
+ }
977
+
978
+ .pi-stopwatch:before {
979
+ content: "\ea01";
980
+ }
981
+
982
+ .pi-verified:before {
983
+ content: "\ea02";
984
+ }
985
+
986
+ .pi-delete-left:before {
987
+ content: "\ea03";
988
+ }
989
+
990
+ .pi-hourglass:before {
991
+ content: "\e9fe";
992
+ }
993
+
994
+ .pi-truck:before {
995
+ content: "\ea00";
996
+ }
997
+
998
+ .pi-wrench:before {
999
+ content: "\e9ff";
1000
+ }
1001
+
1002
+ .pi-microphone:before {
1003
+ content: "\e9fa";
1004
+ }
1005
+
1006
+ .pi-megaphone:before {
1007
+ content: "\e9fb";
1008
+ }
1009
+
1010
+ .pi-arrow-right-arrow-left:before {
1011
+ content: "\e9fc";
1012
+ }
1013
+
1014
+ .pi-bitcoin:before {
1015
+ content: "\e9fd";
1016
+ }
1017
+
1018
+ .pi-file-edit:before {
1019
+ content: "\e9f6";
1020
+ }
1021
+
1022
+ .pi-language:before {
1023
+ content: "\e9f7";
1024
+ }
1025
+
1026
+ .pi-file-export:before {
1027
+ content: "\e9f8";
1028
+ }
1029
+
1030
+ .pi-file-import:before {
1031
+ content: "\e9f9";
1032
+ }
1033
+
1034
+ .pi-file-word:before {
1035
+ content: "\e9f1";
1036
+ }
1037
+
1038
+ .pi-gift:before {
1039
+ content: "\e9f2";
1040
+ }
1041
+
1042
+ .pi-cart-plus:before {
1043
+ content: "\e9f3";
1044
+ }
1045
+
1046
+ .pi-thumbs-down-fill:before {
1047
+ content: "\e9f4";
1048
+ }
1049
+
1050
+ .pi-thumbs-up-fill:before {
1051
+ content: "\e9f5";
1052
+ }
1053
+
1054
+ .pi-arrows-alt:before {
1055
+ content: "\e9f0";
1056
+ }
1057
+
1058
+ .pi-calculator:before {
1059
+ content: "\e9ef";
1060
+ }
1061
+
1062
+ .pi-sort-alt-slash:before {
1063
+ content: "\e9ee";
1064
+ }
1065
+
1066
+ .pi-arrows-h:before {
1067
+ content: "\e9ec";
1068
+ }
1069
+
1070
+ .pi-arrows-v:before {
1071
+ content: "\e9ed";
1072
+ }
1073
+
1074
+ .pi-pound:before {
1075
+ content: "\e9eb";
1076
+ }
1077
+
1078
+ .pi-prime:before {
1079
+ content: "\e9ea";
1080
+ }
1081
+
1082
+ .pi-chart-pie:before {
1083
+ content: "\e9e9";
1084
+ }
1085
+
1086
+ .pi-reddit:before {
1087
+ content: "\e9e8";
1088
+ }
1089
+
1090
+ .pi-code:before {
1091
+ content: "\e9e7";
1092
+ }
1093
+
1094
+ .pi-sync:before {
1095
+ content: "\e9e6";
1096
+ }
1097
+
1098
+ .pi-shopping-bag:before {
1099
+ content: "\e9e5";
1100
+ }
1101
+
1102
+ .pi-server:before {
1103
+ content: "\e9e4";
1104
+ }
1105
+
1106
+ .pi-database:before {
1107
+ content: "\e9e3";
1108
+ }
1109
+
1110
+ .pi-hashtag:before {
1111
+ content: "\e9e2";
1112
+ }
1113
+
1114
+ .pi-bookmark-fill:before {
1115
+ content: "\e9df";
1116
+ }
1117
+
1118
+ .pi-filter-fill:before {
1119
+ content: "\e9e0";
1120
+ }
1121
+
1122
+ .pi-heart-fill:before {
1123
+ content: "\e9e1";
1124
+ }
1125
+
1126
+ .pi-flag-fill:before {
1127
+ content: "\e9de";
1128
+ }
1129
+
1130
+ .pi-circle:before {
1131
+ content: "\e9dc";
1132
+ }
1133
+
1134
+ .pi-circle-fill:before {
1135
+ content: "\e9dd";
1136
+ }
1137
+
1138
+ .pi-bolt:before {
1139
+ content: "\e9db";
1140
+ }
1141
+
1142
+ .pi-history:before {
1143
+ content: "\e9da";
1144
+ }
1145
+
1146
+ .pi-box:before {
1147
+ content: "\e9d9";
1148
+ }
1149
+
1150
+ .pi-at:before {
1151
+ content: "\e9d8";
1152
+ }
1153
+
1154
+ .pi-arrow-up-right:before {
1155
+ content: "\e9d4";
1156
+ }
1157
+
1158
+ .pi-arrow-up-left:before {
1159
+ content: "\e9d5";
1160
+ }
1161
+
1162
+ .pi-arrow-down-left:before {
1163
+ content: "\e9d6";
1164
+ }
1165
+
1166
+ .pi-arrow-down-right:before {
1167
+ content: "\e9d7";
1168
+ }
1169
+
1170
+ .pi-telegram:before {
1171
+ content: "\e9d3";
1172
+ }
1173
+
1174
+ .pi-stop-circle:before {
1175
+ content: "\e9d2";
1176
+ }
1177
+
1178
+ .pi-stop:before {
1179
+ content: "\e9d1";
1180
+ }
1181
+
1182
+ .pi-whatsapp:before {
1183
+ content: "\e9d0";
1184
+ }
1185
+
1186
+ .pi-building:before {
1187
+ content: "\e9cf";
1188
+ }
1189
+
1190
+ .pi-qrcode:before {
1191
+ content: "\e9ce";
1192
+ }
1193
+
1194
+ .pi-car:before {
1195
+ content: "\e9cd";
1196
+ }
1197
+
1198
+ .pi-instagram:before {
1199
+ content: "\e9cc";
1200
+ }
1201
+
1202
+ .pi-linkedin:before {
1203
+ content: "\e9cb";
1204
+ }
1205
+
1206
+ .pi-send:before {
1207
+ content: "\e9ca";
1208
+ }
1209
+
1210
+ .pi-slack:before {
1211
+ content: "\e9c9";
1212
+ }
1213
+
1214
+ .pi-sun:before {
1215
+ content: "\e9c8";
1216
+ }
1217
+
1218
+ .pi-moon:before {
1219
+ content: "\e9c7";
1220
+ }
1221
+
1222
+ .pi-vimeo:before {
1223
+ content: "\e9c6";
1224
+ }
1225
+
1226
+ .pi-youtube:before {
1227
+ content: "\e9c5";
1228
+ }
1229
+
1230
+ .pi-flag:before {
1231
+ content: "\e9c4";
1232
+ }
1233
+
1234
+ .pi-wallet:before {
1235
+ content: "\e9c3";
1236
+ }
1237
+
1238
+ .pi-map:before {
1239
+ content: "\e9c2";
1240
+ }
1241
+
1242
+ .pi-link:before {
1243
+ content: "\e9c1";
1244
+ }
1245
+
1246
+ .pi-credit-card:before {
1247
+ content: "\e9bf";
1248
+ }
1249
+
1250
+ .pi-discord:before {
1251
+ content: "\e9c0";
1252
+ }
1253
+
1254
+ .pi-percentage:before {
1255
+ content: "\e9be";
1256
+ }
1257
+
1258
+ .pi-euro:before {
1259
+ content: "\e9bd";
1260
+ }
1261
+
1262
+ .pi-book:before {
1263
+ content: "\e9ba";
1264
+ }
1265
+
1266
+ .pi-shield:before {
1267
+ content: "\e9b9";
1268
+ }
1269
+
1270
+ .pi-paypal:before {
1271
+ content: "\e9bb";
1272
+ }
1273
+
1274
+ .pi-amazon:before {
1275
+ content: "\e9bc";
1276
+ }
1277
+
1278
+ .pi-phone:before {
1279
+ content: "\e9b8";
1280
+ }
1281
+
1282
+ .pi-filter-slash:before {
1283
+ content: "\e9b7";
1284
+ }
1285
+
1286
+ .pi-facebook:before {
1287
+ content: "\e9b4";
1288
+ }
1289
+
1290
+ .pi-github:before {
1291
+ content: "\e9b5";
1292
+ }
1293
+
1294
+ .pi-twitter:before {
1295
+ content: "\e9b6";
1296
+ }
1297
+
1298
+ .pi-step-backward-alt:before {
1299
+ content: "\e9ac";
1300
+ }
1301
+
1302
+ .pi-step-forward-alt:before {
1303
+ content: "\e9ad";
1304
+ }
1305
+
1306
+ .pi-forward:before {
1307
+ content: "\e9ae";
1308
+ }
1309
+
1310
+ .pi-backward:before {
1311
+ content: "\e9af";
1312
+ }
1313
+
1314
+ .pi-fast-backward:before {
1315
+ content: "\e9b0";
1316
+ }
1317
+
1318
+ .pi-fast-forward:before {
1319
+ content: "\e9b1";
1320
+ }
1321
+
1322
+ .pi-pause:before {
1323
+ content: "\e9b2";
1324
+ }
1325
+
1326
+ .pi-play:before {
1327
+ content: "\e9b3";
1328
+ }
1329
+
1330
+ .pi-compass:before {
1331
+ content: "\e9ab";
1332
+ }
1333
+
1334
+ .pi-id-card:before {
1335
+ content: "\e9aa";
1336
+ }
1337
+
1338
+ .pi-ticket:before {
1339
+ content: "\e9a9";
1340
+ }
1341
+
1342
+ .pi-file-o:before {
1343
+ content: "\e9a8";
1344
+ }
1345
+
1346
+ .pi-reply:before {
1347
+ content: "\e9a7";
1348
+ }
1349
+
1350
+ .pi-directions-alt:before {
1351
+ content: "\e9a5";
1352
+ }
1353
+
1354
+ .pi-directions:before {
1355
+ content: "\e9a6";
1356
+ }
1357
+
1358
+ .pi-thumbs-up:before {
1359
+ content: "\e9a3";
1360
+ }
1361
+
1362
+ .pi-thumbs-down:before {
1363
+ content: "\e9a4";
1364
+ }
1365
+
1366
+ .pi-sort-numeric-down-alt:before {
1367
+ content: "\e996";
1368
+ }
1369
+
1370
+ .pi-sort-numeric-up-alt:before {
1371
+ content: "\e997";
1372
+ }
1373
+
1374
+ .pi-sort-alpha-down-alt:before {
1375
+ content: "\e998";
1376
+ }
1377
+
1378
+ .pi-sort-alpha-up-alt:before {
1379
+ content: "\e999";
1380
+ }
1381
+
1382
+ .pi-sort-numeric-down:before {
1383
+ content: "\e99a";
1384
+ }
1385
+
1386
+ .pi-sort-numeric-up:before {
1387
+ content: "\e99b";
1388
+ }
1389
+
1390
+ .pi-sort-alpha-down:before {
1391
+ content: "\e99c";
1392
+ }
1393
+
1394
+ .pi-sort-alpha-up:before {
1395
+ content: "\e99d";
1396
+ }
1397
+
1398
+ .pi-sort-alt:before {
1399
+ content: "\e99e";
1400
+ }
1401
+
1402
+ .pi-sort-amount-up:before {
1403
+ content: "\e99f";
1404
+ }
1405
+
1406
+ .pi-sort-amount-down:before {
1407
+ content: "\e9a0";
1408
+ }
1409
+
1410
+ .pi-sort-amount-down-alt:before {
1411
+ content: "\e9a1";
1412
+ }
1413
+
1414
+ .pi-sort-amount-up-alt:before {
1415
+ content: "\e9a2";
1416
+ }
1417
+
1418
+ .pi-palette:before {
1419
+ content: "\e995";
1420
+ }
1421
+
1422
+ .pi-undo:before {
1423
+ content: "\e994";
1424
+ }
1425
+
1426
+ .pi-desktop:before {
1427
+ content: "\e993";
1428
+ }
1429
+
1430
+ .pi-sliders-v:before {
1431
+ content: "\e991";
1432
+ }
1433
+
1434
+ .pi-sliders-h:before {
1435
+ content: "\e992";
1436
+ }
1437
+
1438
+ .pi-search-plus:before {
1439
+ content: "\e98f";
1440
+ }
1441
+
1442
+ .pi-search-minus:before {
1443
+ content: "\e990";
1444
+ }
1445
+
1446
+ .pi-file-excel:before {
1447
+ content: "\e98e";
1448
+ }
1449
+
1450
+ .pi-file-pdf:before {
1451
+ content: "\e98d";
1452
+ }
1453
+
1454
+ .pi-check-square:before {
1455
+ content: "\e98c";
1456
+ }
1457
+
1458
+ .pi-chart-line:before {
1459
+ content: "\e98b";
1460
+ }
1461
+
1462
+ .pi-user-edit:before {
1463
+ content: "\e98a";
1464
+ }
1465
+
1466
+ .pi-exclamation-circle:before {
1467
+ content: "\e989";
1468
+ }
1469
+
1470
+ .pi-android:before {
1471
+ content: "\e985";
1472
+ }
1473
+
1474
+ .pi-google:before {
1475
+ content: "\e986";
1476
+ }
1477
+
1478
+ .pi-apple:before {
1479
+ content: "\e987";
1480
+ }
1481
+
1482
+ .pi-microsoft:before {
1483
+ content: "\e988";
1484
+ }
1485
+
1486
+ .pi-heart:before {
1487
+ content: "\e984";
1488
+ }
1489
+
1490
+ .pi-mobile:before {
1491
+ content: "\e982";
1492
+ }
1493
+
1494
+ .pi-tablet:before {
1495
+ content: "\e983";
1496
+ }
1497
+
1498
+ .pi-key:before {
1499
+ content: "\e981";
1500
+ }
1501
+
1502
+ .pi-shopping-cart:before {
1503
+ content: "\e980";
1504
+ }
1505
+
1506
+ .pi-comments:before {
1507
+ content: "\e97e";
1508
+ }
1509
+
1510
+ .pi-comment:before {
1511
+ content: "\e97f";
1512
+ }
1513
+
1514
+ .pi-briefcase:before {
1515
+ content: "\e97d";
1516
+ }
1517
+
1518
+ .pi-bell:before {
1519
+ content: "\e97c";
1520
+ }
1521
+
1522
+ .pi-paperclip:before {
1523
+ content: "\e97b";
1524
+ }
1525
+
1526
+ .pi-share-alt:before {
1527
+ content: "\e97a";
1528
+ }
1529
+
1530
+ .pi-envelope:before {
1531
+ content: "\e979";
1532
+ }
1533
+
1534
+ .pi-volume-down:before {
1535
+ content: "\e976";
1536
+ }
1537
+
1538
+ .pi-volume-up:before {
1539
+ content: "\e977";
1540
+ }
1541
+
1542
+ .pi-volume-off:before {
1543
+ content: "\e978";
1544
+ }
1545
+
1546
+ .pi-eject:before {
1547
+ content: "\e975";
1548
+ }
1549
+
1550
+ .pi-money-bill:before {
1551
+ content: "\e974";
1552
+ }
1553
+
1554
+ .pi-images:before {
1555
+ content: "\e973";
1556
+ }
1557
+
1558
+ .pi-image:before {
1559
+ content: "\e972";
1560
+ }
1561
+
1562
+ .pi-sign-in:before {
1563
+ content: "\e970";
1564
+ }
1565
+
1566
+ .pi-sign-out:before {
1567
+ content: "\e971";
1568
+ }
1569
+
1570
+ .pi-wifi:before {
1571
+ content: "\e96f";
1572
+ }
1573
+
1574
+ .pi-sitemap:before {
1575
+ content: "\e96e";
1576
+ }
1577
+
1578
+ .pi-chart-bar:before {
1579
+ content: "\e96d";
1580
+ }
1581
+
1582
+ .pi-camera:before {
1583
+ content: "\e96c";
1584
+ }
1585
+
1586
+ .pi-dollar:before {
1587
+ content: "\e96b";
1588
+ }
1589
+
1590
+ .pi-lock-open:before {
1591
+ content: "\e96a";
1592
+ }
1593
+
1594
+ .pi-table:before {
1595
+ content: "\e969";
1596
+ }
1597
+
1598
+ .pi-map-marker:before {
1599
+ content: "\e968";
1600
+ }
1601
+
1602
+ .pi-list:before {
1603
+ content: "\e967";
1604
+ }
1605
+
1606
+ .pi-eye-slash:before {
1607
+ content: "\e965";
1608
+ }
1609
+
1610
+ .pi-eye:before {
1611
+ content: "\e966";
1612
+ }
1613
+
1614
+ .pi-folder-open:before {
1615
+ content: "\e964";
1616
+ }
1617
+
1618
+ .pi-folder:before {
1619
+ content: "\e963";
1620
+ }
1621
+
1622
+ .pi-video:before {
1623
+ content: "\e962";
1624
+ }
1625
+
1626
+ .pi-inbox:before {
1627
+ content: "\e961";
1628
+ }
1629
+
1630
+ .pi-lock:before {
1631
+ content: "\e95f";
1632
+ }
1633
+
1634
+ .pi-unlock:before {
1635
+ content: "\e960";
1636
+ }
1637
+
1638
+ .pi-tags:before {
1639
+ content: "\e95d";
1640
+ }
1641
+
1642
+ .pi-tag:before {
1643
+ content: "\e95e";
1644
+ }
1645
+
1646
+ .pi-power-off:before {
1647
+ content: "\e95c";
1648
+ }
1649
+
1650
+ .pi-save:before {
1651
+ content: "\e95b";
1652
+ }
1653
+
1654
+ .pi-question-circle:before {
1655
+ content: "\e959";
1656
+ }
1657
+
1658
+ .pi-question:before {
1659
+ content: "\e95a";
1660
+ }
1661
+
1662
+ .pi-copy:before {
1663
+ content: "\e957";
1664
+ }
1665
+
1666
+ .pi-file:before {
1667
+ content: "\e958";
1668
+ }
1669
+
1670
+ .pi-clone:before {
1671
+ content: "\e955";
1672
+ }
1673
+
1674
+ .pi-calendar-times:before {
1675
+ content: "\e952";
1676
+ }
1677
+
1678
+ .pi-calendar-minus:before {
1679
+ content: "\e953";
1680
+ }
1681
+
1682
+ .pi-calendar-plus:before {
1683
+ content: "\e954";
1684
+ }
1685
+
1686
+ .pi-ellipsis-v:before {
1687
+ content: "\e950";
1688
+ }
1689
+
1690
+ .pi-ellipsis-h:before {
1691
+ content: "\e951";
1692
+ }
1693
+
1694
+ .pi-bookmark:before {
1695
+ content: "\e94e";
1696
+ }
1697
+
1698
+ .pi-globe:before {
1699
+ content: "\e94f";
1700
+ }
1701
+
1702
+ .pi-replay:before {
1703
+ content: "\e94d";
1704
+ }
1705
+
1706
+ .pi-filter:before {
1707
+ content: "\e94c";
1708
+ }
1709
+
1710
+ .pi-print:before {
1711
+ content: "\e94b";
1712
+ }
1713
+
1714
+ .pi-align-right:before {
1715
+ content: "\e946";
1716
+ }
1717
+
1718
+ .pi-align-left:before {
1719
+ content: "\e947";
1720
+ }
1721
+
1722
+ .pi-align-center:before {
1723
+ content: "\e948";
1724
+ }
1725
+
1726
+ .pi-align-justify:before {
1727
+ content: "\e949";
1728
+ }
1729
+
1730
+ .pi-cog:before {
1731
+ content: "\e94a";
1732
+ }
1733
+
1734
+ .pi-cloud-download:before {
1735
+ content: "\e943";
1736
+ }
1737
+
1738
+ .pi-cloud-upload:before {
1739
+ content: "\e944";
1740
+ }
1741
+
1742
+ .pi-cloud:before {
1743
+ content: "\e945";
1744
+ }
1745
+
1746
+ .pi-pencil:before {
1747
+ content: "\e942";
1748
+ }
1749
+
1750
+ .pi-users:before {
1751
+ content: "\e941";
1752
+ }
1753
+
1754
+ .pi-clock:before {
1755
+ content: "\e940";
1756
+ }
1757
+
1758
+ .pi-user-minus:before {
1759
+ content: "\e93e";
1760
+ }
1761
+
1762
+ .pi-user-plus:before {
1763
+ content: "\e93f";
1764
+ }
1765
+
1766
+ .pi-trash:before {
1767
+ content: "\e93d";
1768
+ }
1769
+
1770
+ .pi-external-link:before {
1771
+ content: "\e93c";
1772
+ }
1773
+
1774
+ .pi-window-maximize:before {
1775
+ content: "\e93b";
1776
+ }
1777
+
1778
+ .pi-window-minimize:before {
1779
+ content: "\e93a";
1780
+ }
1781
+
1782
+ .pi-refresh:before {
1783
+ content: "\e938";
1784
+ }
1785
+
1786
+ .pi-user:before {
1787
+ content: "\e939";
1788
+ }
1789
+
1790
+ .pi-exclamation-triangle:before {
1791
+ content: "\e922";
1792
+ }
1793
+
1794
+ .pi-calendar:before {
1795
+ content: "\e927";
1796
+ }
1797
+
1798
+ .pi-chevron-circle-left:before {
1799
+ content: "\e928";
1800
+ }
1801
+
1802
+ .pi-chevron-circle-down:before {
1803
+ content: "\e929";
1804
+ }
1805
+
1806
+ .pi-chevron-circle-right:before {
1807
+ content: "\e92a";
1808
+ }
1809
+
1810
+ .pi-chevron-circle-up:before {
1811
+ content: "\e92b";
1812
+ }
1813
+
1814
+ .pi-angle-double-down:before {
1815
+ content: "\e92c";
1816
+ }
1817
+
1818
+ .pi-angle-double-left:before {
1819
+ content: "\e92d";
1820
+ }
1821
+
1822
+ .pi-angle-double-right:before {
1823
+ content: "\e92e";
1824
+ }
1825
+
1826
+ .pi-angle-double-up:before {
1827
+ content: "\e92f";
1828
+ }
1829
+
1830
+ .pi-angle-down:before {
1831
+ content: "\e930";
1832
+ }
1833
+
1834
+ .pi-angle-left:before {
1835
+ content: "\e931";
1836
+ }
1837
+
1838
+ .pi-angle-right:before {
1839
+ content: "\e932";
1840
+ }
1841
+
1842
+ .pi-angle-up:before {
1843
+ content: "\e933";
1844
+ }
1845
+
1846
+ .pi-upload:before {
1847
+ content: "\e934";
1848
+ }
1849
+
1850
+ .pi-download:before {
1851
+ content: "\e956";
1852
+ }
1853
+
1854
+ .pi-ban:before {
1855
+ content: "\e935";
1856
+ }
1857
+
1858
+ .pi-star-fill:before {
1859
+ content: "\e936";
1860
+ }
1861
+
1862
+ .pi-star:before {
1863
+ content: "\e937";
1864
+ }
1865
+
1866
+ .pi-chevron-left:before {
1867
+ content: "\e900";
1868
+ }
1869
+
1870
+ .pi-chevron-right:before {
1871
+ content: "\e901";
1872
+ }
1873
+
1874
+ .pi-chevron-down:before {
1875
+ content: "\e902";
1876
+ }
1877
+
1878
+ .pi-chevron-up:before {
1879
+ content: "\e903";
1880
+ }
1881
+
1882
+ .pi-caret-left:before {
1883
+ content: "\e904";
1884
+ }
1885
+
1886
+ .pi-caret-right:before {
1887
+ content: "\e905";
1888
+ }
1889
+
1890
+ .pi-caret-down:before {
1891
+ content: "\e906";
1892
+ }
1893
+
1894
+ .pi-caret-up:before {
1895
+ content: "\e907";
1896
+ }
1897
+
1898
+ .pi-search:before {
1899
+ content: "\e908";
1900
+ }
1901
+
1902
+ .pi-check:before {
1903
+ content: "\e909";
1904
+ }
1905
+
1906
+ .pi-check-circle:before {
1907
+ content: "\e90a";
1908
+ }
1909
+
1910
+ .pi-times:before {
1911
+ content: "\e90b";
1912
+ }
1913
+
1914
+ .pi-times-circle:before {
1915
+ content: "\e90c";
1916
+ }
1917
+
1918
+ .pi-plus:before {
1919
+ content: "\e90d";
1920
+ }
1921
+
1922
+ .pi-plus-circle:before {
1923
+ content: "\e90e";
1924
+ }
1925
+
1926
+ .pi-minus:before {
1927
+ content: "\e90f";
1928
+ }
1929
+
1930
+ .pi-minus-circle:before {
1931
+ content: "\e910";
1932
+ }
1933
+
1934
+ .pi-circle-on:before {
1935
+ content: "\e911";
1936
+ }
1937
+
1938
+ .pi-circle-off:before {
1939
+ content: "\e912";
1940
+ }
1941
+
1942
+ .pi-sort-down:before {
1943
+ content: "\e913";
1944
+ }
1945
+
1946
+ .pi-sort-up:before {
1947
+ content: "\e914";
1948
+ }
1949
+
1950
+ .pi-sort:before {
1951
+ content: "\e915";
1952
+ }
1953
+
1954
+ .pi-step-backward:before {
1955
+ content: "\e916";
1956
+ }
1957
+
1958
+ .pi-step-forward:before {
1959
+ content: "\e917";
1960
+ }
1961
+
1962
+ .pi-th-large:before {
1963
+ content: "\e918";
1964
+ }
1965
+
1966
+ .pi-arrow-down:before {
1967
+ content: "\e919";
1968
+ }
1969
+
1970
+ .pi-arrow-left:before {
1971
+ content: "\e91a";
1972
+ }
1973
+
1974
+ .pi-arrow-right:before {
1975
+ content: "\e91b";
1976
+ }
1977
+
1978
+ .pi-arrow-up:before {
1979
+ content: "\e91c";
1980
+ }
1981
+
1982
+ .pi-bars:before {
1983
+ content: "\e91d";
1984
+ }
1985
+
1986
+ .pi-arrow-circle-down:before {
1987
+ content: "\e91e";
1988
+ }
1989
+
1990
+ .pi-arrow-circle-left:before {
1991
+ content: "\e91f";
1992
+ }
1993
+
1994
+ .pi-arrow-circle-right:before {
1995
+ content: "\e920";
1996
+ }
1997
+
1998
+ .pi-arrow-circle-up:before {
1999
+ content: "\e921";
2000
+ }
2001
+
2002
+ .pi-info:before {
2003
+ content: "\e923";
2004
+ }
2005
+
2006
+ .pi-info-circle:before {
2007
+ content: "\e924";
2008
+ }
2009
+
2010
+ .pi-home:before {
2011
+ content: "\e925";
2012
+ }
2013
+
2014
+ .pi-spinner:before {
2015
+ content: "\e926";
2016
+ }
2017
+ @layer primevue, tailwind-utilities;
2018
+
2019
+ @layer tailwind-utilities {
2020
+ .container{
2021
+ width: 100%;
2022
+ }
2023
+ @media (min-width: 640px){
2024
+
2025
+ .container{
2026
+ max-width: 640px;
2027
+ }
2028
+ }
2029
+ @media (min-width: 768px){
2030
+
2031
+ .container{
2032
+ max-width: 768px;
2033
+ }
2034
+ }
2035
+ @media (min-width: 1024px){
2036
+
2037
+ .container{
2038
+ max-width: 1024px;
2039
+ }
2040
+ }
2041
+ @media (min-width: 1280px){
2042
+
2043
+ .container{
2044
+ max-width: 1280px;
2045
+ }
2046
+ }
2047
+ @media (min-width: 1536px){
2048
+
2049
+ .container{
2050
+ max-width: 1536px;
2051
+ }
2052
+ }
2053
+ @media (min-width: 1800px){
2054
+
2055
+ .container{
2056
+ max-width: 1800px;
2057
+ }
2058
+ }
2059
+ @media (min-width: 2500px){
2060
+
2061
+ .container{
2062
+ max-width: 2500px;
2063
+ }
2064
+ }
2065
+ @media (min-width: 3200px){
2066
+
2067
+ .container{
2068
+ max-width: 3200px;
2069
+ }
2070
+ }
2071
+ .pointer-events-none{
2072
+ pointer-events: none;
2073
+ }
2074
+ .pointer-events-auto{
2075
+ pointer-events: auto;
2076
+ }
2077
+ .\!visible{
2078
+ visibility: visible !important;
2079
+ }
2080
+ .visible{
2081
+ visibility: visible;
2082
+ }
2083
+ .invisible{
2084
+ visibility: hidden;
2085
+ }
2086
+ .collapse{
2087
+ visibility: collapse;
2088
+ }
2089
+ .static{
2090
+ position: static;
2091
+ }
2092
+ .fixed{
2093
+ position: fixed;
2094
+ }
2095
+ .absolute{
2096
+ position: absolute;
2097
+ }
2098
+ .relative{
2099
+ position: relative;
2100
+ }
2101
+ .inset-0{
2102
+ inset: 0px;
2103
+ }
2104
+ .-bottom-4{
2105
+ bottom: -1rem;
2106
+ }
2107
+ .-right-14{
2108
+ right: -3.5rem;
2109
+ }
2110
+ .-right-4{
2111
+ right: -1rem;
2112
+ }
2113
+ .bottom-\[10px\]{
2114
+ bottom: 10px;
2115
+ }
2116
+ .bottom-full{
2117
+ bottom: 100%;
2118
+ }
2119
+ .left-0{
2120
+ left: 0px;
2121
+ }
2122
+ .left-\[-350px\]{
2123
+ left: -350px;
2124
+ }
2125
+ .right-\[10px\]{
2126
+ right: 10px;
2127
+ }
2128
+ .top-0{
2129
+ top: 0px;
2130
+ }
2131
+ .top-\[50px\]{
2132
+ top: 50px;
2133
+ }
2134
+ .top-auto{
2135
+ top: auto;
2136
+ }
2137
+ .z-10{
2138
+ z-index: 10;
2139
+ }
2140
+ .z-\[1000\]{
2141
+ z-index: 1000;
2142
+ }
2143
+ .z-\[9999\]{
2144
+ z-index: 9999;
2145
+ }
2146
+ .col-span-full{
2147
+ grid-column: 1 / -1;
2148
+ }
2149
+ .row-span-full{
2150
+ grid-row: 1 / -1;
2151
+ }
2152
+ .m-0{
2153
+ margin: 0px;
2154
+ }
2155
+ .m-1{
2156
+ margin: 0.25rem;
2157
+ }
2158
+ .m-12{
2159
+ margin: 3rem;
2160
+ }
2161
+ .m-2{
2162
+ margin: 0.5rem;
2163
+ }
2164
+ .m-8{
2165
+ margin: 2rem;
2166
+ }
2167
+ .mx-1{
2168
+ margin-left: 0.25rem;
2169
+ margin-right: 0.25rem;
2170
+ }
2171
+ .mx-2{
2172
+ margin-left: 0.5rem;
2173
+ margin-right: 0.5rem;
2174
+ }
2175
+ .mx-6{
2176
+ margin-left: 1.5rem;
2177
+ margin-right: 1.5rem;
2178
+ }
2179
+ .my-0{
2180
+ margin-top: 0px;
2181
+ margin-bottom: 0px;
2182
+ }
2183
+ .my-1{
2184
+ margin-top: 0.25rem;
2185
+ margin-bottom: 0.25rem;
2186
+ }
2187
+ .my-2{
2188
+ margin-top: 0.5rem;
2189
+ margin-bottom: 0.5rem;
2190
+ }
2191
+ .my-2\.5{
2192
+ margin-top: 0.625rem;
2193
+ margin-bottom: 0.625rem;
2194
+ }
2195
+ .my-4{
2196
+ margin-top: 1rem;
2197
+ margin-bottom: 1rem;
2198
+ }
2199
+ .mb-2{
2200
+ margin-bottom: 0.5rem;
2201
+ }
2202
+ .mb-3{
2203
+ margin-bottom: 0.75rem;
2204
+ }
2205
+ .mb-4{
2206
+ margin-bottom: 1rem;
2207
+ }
2208
+ .mb-6{
2209
+ margin-bottom: 1.5rem;
2210
+ }
2211
+ .mb-7{
2212
+ margin-bottom: 1.75rem;
2213
+ }
2214
+ .ml-2{
2215
+ margin-left: 0.5rem;
2216
+ }
2217
+ .ml-\[-13px\]{
2218
+ margin-left: -13px;
2219
+ }
2220
+ .ml-auto{
2221
+ margin-left: auto;
2222
+ }
2223
+ .mr-1{
2224
+ margin-right: 0.25rem;
2225
+ }
2226
+ .mr-2{
2227
+ margin-right: 0.5rem;
2228
+ }
2229
+ .mt-0{
2230
+ margin-top: 0px;
2231
+ }
2232
+ .mt-1{
2233
+ margin-top: 0.25rem;
2234
+ }
2235
+ .mt-2{
2236
+ margin-top: 0.5rem;
2237
+ }
2238
+ .mt-24{
2239
+ margin-top: 6rem;
2240
+ }
2241
+ .mt-4{
2242
+ margin-top: 1rem;
2243
+ }
2244
+ .mt-5{
2245
+ margin-top: 1.25rem;
2246
+ }
2247
+ .mt-6{
2248
+ margin-top: 1.5rem;
2249
+ }
2250
+ .block{
2251
+ display: block;
2252
+ }
2253
+ .inline-block{
2254
+ display: inline-block;
2255
+ }
2256
+ .inline{
2257
+ display: inline;
2258
+ }
2259
+ .flex{
2260
+ display: flex;
2261
+ }
2262
+ .inline-flex{
2263
+ display: inline-flex;
2264
+ }
2265
+ .table{
2266
+ display: table;
2267
+ }
2268
+ .grid{
2269
+ display: grid;
2270
+ }
2271
+ .contents{
2272
+ display: contents;
2273
+ }
2274
+ .hidden{
2275
+ display: none;
2276
+ }
2277
+ .h-0{
2278
+ height: 0px;
2279
+ }
2280
+ .h-1{
2281
+ height: 0.25rem;
2282
+ }
2283
+ .h-1\/2{
2284
+ height: 50%;
2285
+ }
2286
+ .h-16{
2287
+ height: 4rem;
2288
+ }
2289
+ .h-6{
2290
+ height: 1.5rem;
2291
+ }
2292
+ .h-64{
2293
+ height: 16rem;
2294
+ }
2295
+ .h-8{
2296
+ height: 2rem;
2297
+ }
2298
+ .h-96{
2299
+ height: 26rem;
2300
+ }
2301
+ .h-\[22px\]{
2302
+ height: 22px;
2303
+ }
2304
+ .h-\[30rem\]{
2305
+ height: 30rem;
2306
+ }
2307
+ .h-\[var\(--comfy-topbar-height\)\]{
2308
+ height: var(--comfy-topbar-height);
2309
+ }
2310
+ .h-full{
2311
+ height: 100%;
2312
+ }
2313
+ .h-screen{
2314
+ height: 100vh;
2315
+ }
2316
+ .max-h-96{
2317
+ max-height: 26rem;
2318
+ }
2319
+ .max-h-full{
2320
+ max-height: 100%;
2321
+ }
2322
+ .min-h-52{
2323
+ min-height: 13rem;
2324
+ }
2325
+ .min-h-8{
2326
+ min-height: 2rem;
2327
+ }
2328
+ .min-h-full{
2329
+ min-height: 100%;
2330
+ }
2331
+ .min-h-screen{
2332
+ min-height: 100vh;
2333
+ }
2334
+ .w-1\/2{
2335
+ width: 50%;
2336
+ }
2337
+ .w-12{
2338
+ width: 3rem;
2339
+ }
2340
+ .w-14{
2341
+ width: 3.5rem;
2342
+ }
2343
+ .w-16{
2344
+ width: 4rem;
2345
+ }
2346
+ .w-28{
2347
+ width: 7rem;
2348
+ }
2349
+ .w-3\/12{
2350
+ width: 25%;
2351
+ }
2352
+ .w-44{
2353
+ width: 11rem;
2354
+ }
2355
+ .w-48{
2356
+ width: 12rem;
2357
+ }
2358
+ .w-6{
2359
+ width: 1.5rem;
2360
+ }
2361
+ .w-64{
2362
+ width: 16rem;
2363
+ }
2364
+ .w-8{
2365
+ width: 2rem;
2366
+ }
2367
+ .w-\[22px\]{
2368
+ width: 22px;
2369
+ }
2370
+ .w-\[600px\]{
2371
+ width: 600px;
2372
+ }
2373
+ .w-auto{
2374
+ width: auto;
2375
+ }
2376
+ .w-fit{
2377
+ width: -moz-fit-content;
2378
+ width: fit-content;
2379
+ }
2380
+ .w-full{
2381
+ width: 100%;
2382
+ }
2383
+ .w-screen{
2384
+ width: 100vw;
2385
+ }
2386
+ .min-w-0{
2387
+ min-width: 0px;
2388
+ }
2389
+ .min-w-110{
2390
+ min-width: 32rem;
2391
+ }
2392
+ .min-w-32{
2393
+ min-width: 8rem;
2394
+ }
2395
+ .min-w-84{
2396
+ min-width: 22rem;
2397
+ }
2398
+ .min-w-96{
2399
+ min-width: 26rem;
2400
+ }
2401
+ .min-w-full{
2402
+ min-width: 100%;
2403
+ }
2404
+ .max-w-110{
2405
+ max-width: 32rem;
2406
+ }
2407
+ .max-w-48{
2408
+ max-width: 12rem;
2409
+ }
2410
+ .max-w-64{
2411
+ max-width: 16rem;
2412
+ }
2413
+ .max-w-\[150px\]{
2414
+ max-width: 150px;
2415
+ }
2416
+ .max-w-\[600px\]{
2417
+ max-width: 600px;
2418
+ }
2419
+ .max-w-full{
2420
+ max-width: 100%;
2421
+ }
2422
+ .max-w-screen-sm{
2423
+ max-width: 640px;
2424
+ }
2425
+ .flex-1{
2426
+ flex: 1 1 0%;
2427
+ }
2428
+ .flex-shrink-0{
2429
+ flex-shrink: 0;
2430
+ }
2431
+ .shrink-0{
2432
+ flex-shrink: 0;
2433
+ }
2434
+ .flex-grow{
2435
+ flex-grow: 1;
2436
+ }
2437
+ .grow{
2438
+ flex-grow: 1;
2439
+ }
2440
+ .border-collapse{
2441
+ border-collapse: collapse;
2442
+ }
2443
+ .-translate-y-40{
2444
+ --tw-translate-y: -10rem;
2445
+ transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));
2446
+ }
2447
+ .scale-75{
2448
+ --tw-scale-x: .75;
2449
+ --tw-scale-y: .75;
2450
+ transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));
2451
+ }
2452
+ .transform{
2453
+ transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y));
2454
+ }
2455
+ .cursor-move{
2456
+ cursor: move;
2457
+ }
2458
+ .cursor-pointer{
2459
+ cursor: pointer;
2460
+ }
2461
+ .select-none{
2462
+ -webkit-user-select: none;
2463
+ -moz-user-select: none;
2464
+ user-select: none;
2465
+ }
2466
+ .resize{
2467
+ resize: both;
2468
+ }
2469
+ .list-inside{
2470
+ list-style-position: inside;
2471
+ }
2472
+ .list-disc{
2473
+ list-style-type: disc;
2474
+ }
2475
+ .grid-cols-2{
2476
+ grid-template-columns: repeat(2, minmax(0, 1fr));
2477
+ }
2478
+ .flex-row{
2479
+ flex-direction: row;
2480
+ }
2481
+ .flex-row-reverse{
2482
+ flex-direction: row-reverse;
2483
+ }
2484
+ .flex-col{
2485
+ flex-direction: column;
2486
+ }
2487
+ .flex-wrap{
2488
+ flex-wrap: wrap;
2489
+ }
2490
+ .flex-nowrap{
2491
+ flex-wrap: nowrap;
2492
+ }
2493
+ .content-center{
2494
+ align-content: center;
2495
+ }
2496
+ .items-center{
2497
+ align-items: center;
2498
+ }
2499
+ .justify-end{
2500
+ justify-content: flex-end;
2501
+ }
2502
+ .justify-center{
2503
+ justify-content: center;
2504
+ }
2505
+ .justify-between{
2506
+ justify-content: space-between;
2507
+ }
2508
+ .justify-around{
2509
+ justify-content: space-around;
2510
+ }
2511
+ .justify-evenly{
2512
+ justify-content: space-evenly;
2513
+ }
2514
+ .gap-0{
2515
+ gap: 0px;
2516
+ }
2517
+ .gap-1{
2518
+ gap: 0.25rem;
2519
+ }
2520
+ .gap-2{
2521
+ gap: 0.5rem;
2522
+ }
2523
+ .gap-3{
2524
+ gap: 0.75rem;
2525
+ }
2526
+ .gap-4{
2527
+ gap: 1rem;
2528
+ }
2529
+ .gap-6{
2530
+ gap: 1.5rem;
2531
+ }
2532
+ .gap-8{
2533
+ gap: 2rem;
2534
+ }
2535
+ .space-x-1 > :not([hidden]) ~ :not([hidden]){
2536
+ --tw-space-x-reverse: 0;
2537
+ margin-right: calc(0.25rem * var(--tw-space-x-reverse));
2538
+ margin-left: calc(0.25rem * calc(1 - var(--tw-space-x-reverse)));
2539
+ }
2540
+ .space-y-1 > :not([hidden]) ~ :not([hidden]){
2541
+ --tw-space-y-reverse: 0;
2542
+ margin-top: calc(0.25rem * calc(1 - var(--tw-space-y-reverse)));
2543
+ margin-bottom: calc(0.25rem * var(--tw-space-y-reverse));
2544
+ }
2545
+ .space-y-2 > :not([hidden]) ~ :not([hidden]){
2546
+ --tw-space-y-reverse: 0;
2547
+ margin-top: calc(0.5rem * calc(1 - var(--tw-space-y-reverse)));
2548
+ margin-bottom: calc(0.5rem * var(--tw-space-y-reverse));
2549
+ }
2550
+ .space-y-4 > :not([hidden]) ~ :not([hidden]){
2551
+ --tw-space-y-reverse: 0;
2552
+ margin-top: calc(1rem * calc(1 - var(--tw-space-y-reverse)));
2553
+ margin-bottom: calc(1rem * var(--tw-space-y-reverse));
2554
+ }
2555
+ .place-self-end{
2556
+ place-self: end;
2557
+ }
2558
+ .justify-self-end{
2559
+ justify-self: end;
2560
+ }
2561
+ .overflow-auto{
2562
+ overflow: auto;
2563
+ }
2564
+ .overflow-hidden{
2565
+ overflow: hidden;
2566
+ }
2567
+ .overflow-y-auto{
2568
+ overflow-y: auto;
2569
+ }
2570
+ .overflow-x-hidden{
2571
+ overflow-x: hidden;
2572
+ }
2573
+ .truncate{
2574
+ overflow: hidden;
2575
+ text-overflow: ellipsis;
2576
+ white-space: nowrap;
2577
+ }
2578
+ .text-ellipsis{
2579
+ text-overflow: ellipsis;
2580
+ }
2581
+ .whitespace-nowrap{
2582
+ white-space: nowrap;
2583
+ }
2584
+ .whitespace-pre-line{
2585
+ white-space: pre-line;
2586
+ }
2587
+ .text-wrap{
2588
+ text-wrap: wrap;
2589
+ }
2590
+ .text-nowrap{
2591
+ text-wrap: nowrap;
2592
+ }
2593
+ .rounded{
2594
+ border-radius: 0.25rem;
2595
+ }
2596
+ .rounded-lg{
2597
+ border-radius: 0.5rem;
2598
+ }
2599
+ .rounded-none{
2600
+ border-radius: 0px;
2601
+ }
2602
+ .rounded-t-lg{
2603
+ border-top-left-radius: 0.5rem;
2604
+ border-top-right-radius: 0.5rem;
2605
+ }
2606
+ .border{
2607
+ border-width: 1px;
2608
+ }
2609
+ .border-0{
2610
+ border-width: 0px;
2611
+ }
2612
+ .border-x-0{
2613
+ border-left-width: 0px;
2614
+ border-right-width: 0px;
2615
+ }
2616
+ .border-y{
2617
+ border-top-width: 1px;
2618
+ border-bottom-width: 1px;
2619
+ }
2620
+ .border-b{
2621
+ border-bottom-width: 1px;
2622
+ }
2623
+ .border-l{
2624
+ border-left-width: 1px;
2625
+ }
2626
+ .border-r{
2627
+ border-right-width: 1px;
2628
+ }
2629
+ .border-t-0{
2630
+ border-top-width: 0px;
2631
+ }
2632
+ .border-solid{
2633
+ border-style: solid;
2634
+ }
2635
+ .border-hidden{
2636
+ border-style: hidden;
2637
+ }
2638
+ .border-none{
2639
+ border-style: none;
2640
+ }
2641
+ .border-neutral-700{
2642
+ --tw-border-opacity: 1;
2643
+ border-color: rgb(64 64 64 / var(--tw-border-opacity));
2644
+ }
2645
+ .bg-\[var\(--comfy-menu-bg\)\]{
2646
+ background-color: var(--comfy-menu-bg);
2647
+ }
2648
+ .bg-\[var\(--p-tree-background\)\]{
2649
+ background-color: var(--p-tree-background);
2650
+ }
2651
+ .bg-black{
2652
+ --tw-bg-opacity: 1;
2653
+ background-color: rgb(0 0 0 / var(--tw-bg-opacity));
2654
+ }
2655
+ .bg-blue-500{
2656
+ --tw-bg-opacity: 1;
2657
+ background-color: rgb(66 153 225 / var(--tw-bg-opacity));
2658
+ }
2659
+ .bg-gray-100{
2660
+ --tw-bg-opacity: 1;
2661
+ background-color: rgb(243 246 250 / var(--tw-bg-opacity));
2662
+ }
2663
+ .bg-gray-800{
2664
+ --tw-bg-opacity: 1;
2665
+ background-color: rgb(45 55 72 / var(--tw-bg-opacity));
2666
+ }
2667
+ .bg-green-500{
2668
+ --tw-bg-opacity: 1;
2669
+ background-color: rgb(150 206 76 / var(--tw-bg-opacity));
2670
+ }
2671
+ .bg-neutral-300{
2672
+ --tw-bg-opacity: 1;
2673
+ background-color: rgb(212 212 212 / var(--tw-bg-opacity));
2674
+ }
2675
+ .bg-neutral-700{
2676
+ --tw-bg-opacity: 1;
2677
+ background-color: rgb(64 64 64 / var(--tw-bg-opacity));
2678
+ }
2679
+ .bg-neutral-800{
2680
+ --tw-bg-opacity: 1;
2681
+ background-color: rgb(38 38 38 / var(--tw-bg-opacity));
2682
+ }
2683
+ .bg-neutral-900{
2684
+ --tw-bg-opacity: 1;
2685
+ background-color: rgb(23 23 23 / var(--tw-bg-opacity));
2686
+ }
2687
+ .bg-red-500{
2688
+ --tw-bg-opacity: 1;
2689
+ background-color: rgb(239 68 68 / var(--tw-bg-opacity));
2690
+ }
2691
+ .bg-red-700{
2692
+ --tw-bg-opacity: 1;
2693
+ background-color: rgb(185 28 28 / var(--tw-bg-opacity));
2694
+ }
2695
+ .bg-transparent{
2696
+ background-color: transparent;
2697
+ }
2698
+ .bg-opacity-50{
2699
+ --tw-bg-opacity: 0.5;
2700
+ }
2701
+ .bg-\[url\(\'\/assets\/images\/Git-Logo-White\.svg\'\)\]{
2702
+ background-image: url('../assets/images/Git-Logo-White.svg');
2703
+ }
2704
+ .bg-right-top{
2705
+ background-position: right top;
2706
+ }
2707
+ .bg-no-repeat{
2708
+ background-repeat: no-repeat;
2709
+ }
2710
+ .bg-origin-padding{
2711
+ background-origin: padding-box;
2712
+ }
2713
+ .object-contain{
2714
+ -o-object-fit: contain;
2715
+ object-fit: contain;
2716
+ }
2717
+ .object-cover{
2718
+ -o-object-fit: cover;
2719
+ object-fit: cover;
2720
+ }
2721
+ .p-0{
2722
+ padding: 0px;
2723
+ }
2724
+ .p-1{
2725
+ padding: 0.25rem;
2726
+ }
2727
+ .p-2{
2728
+ padding: 0.5rem;
2729
+ }
2730
+ .p-3{
2731
+ padding: 0.75rem;
2732
+ }
2733
+ .p-4{
2734
+ padding: 1rem;
2735
+ }
2736
+ .p-5{
2737
+ padding: 1.25rem;
2738
+ }
2739
+ .p-6{
2740
+ padding: 1.5rem;
2741
+ }
2742
+ .p-8{
2743
+ padding: 2rem;
2744
+ }
2745
+ .px-0{
2746
+ padding-left: 0px;
2747
+ padding-right: 0px;
2748
+ }
2749
+ .px-10{
2750
+ padding-left: 2.5rem;
2751
+ padding-right: 2.5rem;
2752
+ }
2753
+ .px-2{
2754
+ padding-left: 0.5rem;
2755
+ padding-right: 0.5rem;
2756
+ }
2757
+ .px-4{
2758
+ padding-left: 1rem;
2759
+ padding-right: 1rem;
2760
+ }
2761
+ .py-0{
2762
+ padding-top: 0px;
2763
+ padding-bottom: 0px;
2764
+ }
2765
+ .py-1{
2766
+ padding-top: 0.25rem;
2767
+ padding-bottom: 0.25rem;
2768
+ }
2769
+ .pb-0{
2770
+ padding-bottom: 0px;
2771
+ }
2772
+ .pl-4{
2773
+ padding-left: 1rem;
2774
+ }
2775
+ .pl-6{
2776
+ padding-left: 1.5rem;
2777
+ }
2778
+ .pr-0{
2779
+ padding-right: 0px;
2780
+ }
2781
+ .pr-2{
2782
+ padding-right: 0.5rem;
2783
+ }
2784
+ .pt-2{
2785
+ padding-top: 0.5rem;
2786
+ }
2787
+ .pt-4{
2788
+ padding-top: 1rem;
2789
+ }
2790
+ .pt-6{
2791
+ padding-top: 1.5rem;
2792
+ }
2793
+ .pt-8{
2794
+ padding-top: 2rem;
2795
+ }
2796
+ .text-center{
2797
+ text-align: center;
2798
+ }
2799
+ .text-right{
2800
+ text-align: right;
2801
+ }
2802
+ .font-mono{
2803
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
2804
+ }
2805
+ .font-sans{
2806
+ font-family: ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
2807
+ }
2808
+ .text-2xl{
2809
+ font-size: 1.5rem;
2810
+ }
2811
+ .text-3xl{
2812
+ font-size: 1.875rem;
2813
+ }
2814
+ .text-4xl{
2815
+ font-size: 2.25rem;
2816
+ }
2817
+ .text-lg{
2818
+ font-size: 1.125rem;
2819
+ }
2820
+ .text-sm{
2821
+ font-size: 0.875rem;
2822
+ }
2823
+ .text-xl{
2824
+ font-size: 1.25rem;
2825
+ }
2826
+ .text-xs{
2827
+ font-size: 0.75rem;
2828
+ }
2829
+ .font-bold{
2830
+ font-weight: 700;
2831
+ }
2832
+ .font-light{
2833
+ font-weight: 300;
2834
+ }
2835
+ .font-medium{
2836
+ font-weight: 500;
2837
+ }
2838
+ .font-normal{
2839
+ font-weight: 400;
2840
+ }
2841
+ .font-semibold{
2842
+ font-weight: 600;
2843
+ }
2844
+ .uppercase{
2845
+ text-transform: uppercase;
2846
+ }
2847
+ .italic{
2848
+ font-style: italic;
2849
+ }
2850
+ .text-blue-400{
2851
+ --tw-text-opacity: 1;
2852
+ color: rgb(99 179 237 / var(--tw-text-opacity));
2853
+ }
2854
+ .text-gray-400{
2855
+ --tw-text-opacity: 1;
2856
+ color: rgb(203 213 224 / var(--tw-text-opacity));
2857
+ }
2858
+ .text-green-500{
2859
+ --tw-text-opacity: 1;
2860
+ color: rgb(150 206 76 / var(--tw-text-opacity));
2861
+ }
2862
+ .text-highlight{
2863
+ color: var(--p-primary-color);
2864
+ }
2865
+ .text-muted{
2866
+ color: var(--p-text-muted-color);
2867
+ }
2868
+ .text-neutral-100{
2869
+ --tw-text-opacity: 1;
2870
+ color: rgb(245 245 245 / var(--tw-text-opacity));
2871
+ }
2872
+ .text-neutral-200{
2873
+ --tw-text-opacity: 1;
2874
+ color: rgb(229 229 229 / var(--tw-text-opacity));
2875
+ }
2876
+ .text-neutral-300{
2877
+ --tw-text-opacity: 1;
2878
+ color: rgb(212 212 212 / var(--tw-text-opacity));
2879
+ }
2880
+ .text-neutral-400{
2881
+ --tw-text-opacity: 1;
2882
+ color: rgb(163 163 163 / var(--tw-text-opacity));
2883
+ }
2884
+ .text-neutral-800{
2885
+ --tw-text-opacity: 1;
2886
+ color: rgb(38 38 38 / var(--tw-text-opacity));
2887
+ }
2888
+ .text-neutral-900{
2889
+ --tw-text-opacity: 1;
2890
+ color: rgb(23 23 23 / var(--tw-text-opacity));
2891
+ }
2892
+ .text-red-500{
2893
+ --tw-text-opacity: 1;
2894
+ color: rgb(239 68 68 / var(--tw-text-opacity));
2895
+ }
2896
+ .underline{
2897
+ text-decoration-line: underline;
2898
+ }
2899
+ .no-underline{
2900
+ text-decoration-line: none;
2901
+ }
2902
+ .antialiased{
2903
+ -webkit-font-smoothing: antialiased;
2904
+ -moz-osx-font-smoothing: grayscale;
2905
+ }
2906
+ .opacity-0{
2907
+ opacity: 0;
2908
+ }
2909
+ .opacity-100{
2910
+ opacity: 1;
2911
+ }
2912
+ .opacity-15{
2913
+ opacity: 0.15;
2914
+ }
2915
+ .opacity-25{
2916
+ opacity: 0.25;
2917
+ }
2918
+ .opacity-40{
2919
+ opacity: 0.4;
2920
+ }
2921
+ .opacity-50{
2922
+ opacity: 0.5;
2923
+ }
2924
+ .opacity-65{
2925
+ opacity: 0.65;
2926
+ }
2927
+ .opacity-75{
2928
+ opacity: 0.75;
2929
+ }
2930
+ .shadow-lg{
2931
+ --tw-shadow: 0 10px 15px -3px rgb(0 0 0 / 0.1), 0 4px 6px -4px rgb(0 0 0 / 0.1);
2932
+ --tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);
2933
+ box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow);
2934
+ }
2935
+ .outline{
2936
+ outline-style: solid;
2937
+ }
2938
+ .blur{
2939
+ --tw-blur: blur(8px);
2940
+ filter: var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow);
2941
+ }
2942
+ .drop-shadow{
2943
+ --tw-drop-shadow: drop-shadow(0 1px 2px rgb(0 0 0 / 0.1)) drop-shadow(0 1px 1px rgb(0 0 0 / 0.06));
2944
+ filter: var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow);
2945
+ }
2946
+ .invert{
2947
+ --tw-invert: invert(100%);
2948
+ filter: var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow);
2949
+ }
2950
+ .filter{
2951
+ filter: var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow);
2952
+ }
2953
+ .backdrop-filter{
2954
+ -webkit-backdrop-filter: var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);
2955
+ backdrop-filter: var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);
2956
+ }
2957
+ .transition{
2958
+ transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, -webkit-backdrop-filter;
2959
+ transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter;
2960
+ transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter, -webkit-backdrop-filter;
2961
+ transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
2962
+ transition-duration: 150ms;
2963
+ }
2964
+ .transition-all{
2965
+ transition-property: all;
2966
+ transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
2967
+ transition-duration: 150ms;
2968
+ }
2969
+ .transition-opacity{
2970
+ transition-property: opacity;
2971
+ transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
2972
+ transition-duration: 150ms;
2973
+ }
2974
+ .duration-100{
2975
+ transition-duration: 100ms;
2976
+ }
2977
+ .duration-200{
2978
+ transition-duration: 200ms;
2979
+ }
2980
+ .duration-300{
2981
+ transition-duration: 300ms;
2982
+ }
2983
+ .ease-in{
2984
+ transition-timing-function: cubic-bezier(0.4, 0, 1, 1);
2985
+ }
2986
+ .ease-in-out{
2987
+ transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
2988
+ }
2989
+ .ease-out{
2990
+ transition-timing-function: cubic-bezier(0, 0, 0.2, 1);
2991
+ }
2992
+ .content-\[\'\'\]{
2993
+ --tw-content: '';
2994
+ content: var(--tw-content);
2995
+ }
2996
+ }
2997
+
2998
+ :root {
2999
+ --fg-color: #000;
3000
+ --bg-color: #fff;
3001
+ --comfy-menu-bg: #353535;
3002
+ --comfy-menu-secondary-bg: #292929;
3003
+ --comfy-topbar-height: 2.5rem;
3004
+ --comfy-input-bg: #222;
3005
+ --input-text: #ddd;
3006
+ --descrip-text: #999;
3007
+ --drag-text: #ccc;
3008
+ --error-text: #ff4444;
3009
+ --border-color: #4e4e4e;
3010
+ --tr-even-bg-color: #222;
3011
+ --tr-odd-bg-color: #353535;
3012
+ --primary-bg: #236692;
3013
+ --primary-fg: #ffffff;
3014
+ --primary-hover-bg: #3485bb;
3015
+ --primary-hover-fg: #ffffff;
3016
+ --content-bg: #e0e0e0;
3017
+ --content-fg: #000;
3018
+ --content-hover-bg: #adadad;
3019
+ --content-hover-fg: #000;
3020
+ }
3021
+
3022
+ @media (prefers-color-scheme: dark) {
3023
+ :root {
3024
+ --fg-color: #fff;
3025
+ --bg-color: #202020;
3026
+ --content-bg: #4e4e4e;
3027
+ --content-fg: #fff;
3028
+ --content-hover-bg: #222;
3029
+ --content-hover-fg: #fff;
3030
+ }
3031
+ }
3032
+
3033
+ body {
3034
+ width: 100vw;
3035
+ height: 100vh;
3036
+ margin: 0;
3037
+ overflow: hidden;
3038
+ grid-template-columns: auto 1fr auto;
3039
+ grid-template-rows: auto 1fr auto;
3040
+ background: var(--bg-color) var(--bg-img);
3041
+ color: var(--fg-color);
3042
+ min-height: -webkit-fill-available;
3043
+ max-height: -webkit-fill-available;
3044
+ min-width: -webkit-fill-available;
3045
+ max-width: -webkit-fill-available;
3046
+ font-family: Arial, sans-serif;
3047
+ }
3048
+
3049
+ /**
3050
+ +------------------+------------------+------------------+
3051
+ | |
3052
+ | .comfyui-body- |
3053
+ | top |
3054
+ | (spans all cols) |
3055
+ | |
3056
+ +------------------+------------------+------------------+
3057
+ | | | |
3058
+ | .comfyui-body- | #graph-canvas | .comfyui-body- |
3059
+ | left | | right |
3060
+ | | | |
3061
+ | | | |
3062
+ +------------------+------------------+------------------+
3063
+ | |
3064
+ | .comfyui-body- |
3065
+ | bottom |
3066
+ | (spans all cols) |
3067
+ | |
3068
+ +------------------+------------------+------------------+
3069
+ */
3070
+
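+ /* A minimal sketch (not part of the original stylesheet; element tags are
+    assumed for illustration) of the markup the grid diagram above maps onto.
+    Only the class names and the #graph-canvas id come from the rules below:
+
+    <body>
+      <div class="comfyui-body-top">…</div>
+      <div class="comfyui-body-left">…</div>
+      <div class="graph-canvas-container">
+        <canvas id="graph-canvas"></canvas>
+      </div>
+      <div class="comfyui-body-right">…</div>
+      <div class="comfyui-body-bottom">…</div>
+    </body>
+ */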
3071
+ .comfyui-body-top {
3072
+ order: -5;
3073
+ /* Span across all columns */
3074
+ grid-column: 1/-1;
3075
+ /* Position at the first row */
3076
+ grid-row: 1;
3077
+ /* Top menu bar dropdown needs to be above the graph canvas splitter overlay, which is z-index: 999 */
3078
+ /* Top menu bar z-index needs to be higher than bottom menu bar z-index as by default
3079
+ pysssss's image feed is located at body-bottom, and it can overlap with the queue button, which
3080
+ is located in body-top. */
3081
+ z-index: 1001;
3082
+ display: flex;
3083
+ flex-direction: column;
3084
+ }
3085
+
3086
+ .comfyui-body-left {
3087
+ order: -4;
3088
+ /* Position in the first column */
3089
+ grid-column: 1;
3090
+ /* Position below the top element */
3091
+ grid-row: 2;
3092
+ z-index: 10;
3093
+ display: flex;
3094
+ }
3095
+
3096
+ .graph-canvas-container {
3097
+ width: 100%;
3098
+ height: 100%;
3099
+ order: -3;
3100
+ grid-column: 2;
3101
+ grid-row: 2;
3102
+ position: relative;
3103
+ overflow: hidden;
3104
+ }
3105
+
3106
+ #graph-canvas {
3107
+ width: 100%;
3108
+ height: 100%;
3109
+ touch-action: none;
3110
+ }
3111
+
3112
+ .comfyui-body-right {
3113
+ order: -2;
3114
+ z-index: 10;
3115
+ grid-column: 3;
3116
+ grid-row: 2;
3117
+ }
3118
+
3119
+ .comfyui-body-bottom {
3120
+ order: 4;
3121
+ /* Span across all columns */
3122
+ grid-column: 1/-1;
3123
+ grid-row: 3;
3124
+ /* Bottom menu bar dropdown needs to be above the graph canvas splitter overlay, which is z-index: 999 */
3125
+ z-index: 1000;
3126
+ display: flex;
3127
+ flex-direction: column;
3128
+ }
3129
+
3130
+ .comfy-multiline-input {
3131
+ background-color: var(--comfy-input-bg);
3132
+ color: var(--input-text);
3133
+ overflow: hidden;
3134
+ overflow-y: auto;
3135
+ padding: 2px;
3136
+ resize: none;
3137
+ border: none;
3138
+ box-sizing: border-box;
3139
+ font-size: var(--comfy-textarea-font-size);
3140
+ }
3141
+
3142
+ .comfy-markdown {
3143
+ /* We assign the textarea and the Tiptap editor to the same CSS grid area to stack them on top of one another. */
3144
+ display: grid;
3145
+ }
3146
+
3147
+ .comfy-markdown > textarea {
3148
+ grid-area: 1 / 1 / 2 / 2;
3149
+ }
3150
+
3151
+ .comfy-markdown .tiptap {
3152
+ grid-area: 1 / 1 / 2 / 2;
3153
+ background-color: var(--comfy-input-bg);
3154
+ color: var(--input-text);
3155
+ overflow: hidden;
3156
+ overflow-y: auto;
3157
+ resize: none;
3158
+ border: none;
3159
+ box-sizing: border-box;
3160
+ font-size: var(--comfy-textarea-font-size);
3161
+ height: 100%;
3162
+ padding: 0.5em;
3163
+ }
3164
+
3165
+ .comfy-markdown.editing .tiptap {
3166
+ display: none;
3167
+ }
3168
+
3169
+ .comfy-markdown .tiptap :first-child {
3170
+ margin-top: 0;
3171
+ }
3172
+
3173
+ .comfy-markdown .tiptap :last-child {
3174
+ margin-bottom: 0;
3175
+ }
3176
+
3177
+ .comfy-markdown .tiptap blockquote {
3178
+ border-left: medium solid;
3179
+ margin-left: 1em;
3180
+ padding-left: 0.5em;
3181
+ }
3182
+
3183
+ .comfy-markdown .tiptap pre {
3184
+ border: thin dotted;
3185
+ border-radius: 0.5em;
3186
+ margin: 0.5em;
3187
+ padding: 0.5em;
3188
+ }
3189
+
3190
+ .comfy-markdown .tiptap table {
3191
+ border-collapse: collapse;
3192
+ }
3193
+
3194
+ .comfy-markdown .tiptap th {
3195
+ text-align: left;
3196
+ background: var(--comfy-menu-bg);
3197
+ }
3198
+
3199
+ .comfy-markdown .tiptap th,
3200
+ .comfy-markdown .tiptap td {
3201
+ padding: 0.5em;
3202
+ border: thin solid;
3203
+ }
3204
+
3205
+ .comfy-modal {
3206
+ display: none; /* Hidden by default */
3207
+ position: fixed; /* Stay in place */
3208
+ z-index: 100; /* Sit on top */
3209
+ padding: 30px 30px 10px 30px;
3210
+ background-color: var(--comfy-menu-bg); /* Modal background */
3211
+ color: var(--error-text);
3212
+ box-shadow: 0 0 20px #888888;
3213
+ border-radius: 10px;
3214
+ top: 50%;
3215
+ left: 50%;
3216
+ max-width: 80vw;
3217
+ max-height: 80vh;
3218
+ transform: translate(-50%, -50%);
3219
+ overflow: hidden;
3220
+ justify-content: center;
3221
+ font-family: monospace;
3222
+ font-size: 15px;
3223
+ }
3224
+
3225
+ .comfy-modal-content {
3226
+ display: flex;
3227
+ flex-direction: column;
3228
+ }
3229
+
3230
+ .comfy-modal p {
3231
+ overflow: auto;
3232
+ white-space: pre-line; /* This will respect line breaks */
3233
+ margin-bottom: 20px; /* Add some margin between the text and the close button*/
3234
+ }
3235
+
3236
+ .comfy-modal select,
3237
+ .comfy-modal input[type='button'],
3238
+ .comfy-modal input[type='checkbox'] {
3239
+ margin: 3px 3px 3px 4px;
3240
+ }
3241
+
3242
+ .comfy-menu {
3243
+ font-size: 15px;
3244
+ position: absolute;
3245
+ top: 50%;
3246
+ right: 0;
3247
+ text-align: center;
3248
+ z-index: 999;
3249
+ width: 190px;
3250
+ display: flex;
3251
+ flex-direction: column;
3252
+ align-items: center;
3253
+ color: var(--descrip-text);
3254
+ background-color: var(--comfy-menu-bg);
3255
+ font-family: sans-serif;
3256
+ padding: 10px;
3257
+ border-radius: 0 8px 8px 8px;
3258
+ box-shadow: 3px 3px 8px rgba(0, 0, 0, 0.4);
3259
+ }
3260
+
3261
+ .comfy-menu-header {
3262
+ display: flex;
3263
+ }
3264
+
3265
+ .comfy-menu-actions {
3266
+ display: flex;
3267
+ gap: 3px;
3268
+ align-items: center;
3269
+ height: 20px;
3270
+ position: relative;
3271
+ top: -1px;
3272
+ font-size: 22px;
3273
+ }
3274
+
3275
+ .comfy-menu .comfy-menu-actions button {
3276
+ background-color: rgba(0, 0, 0, 0);
3277
+ padding: 0;
3278
+ border: none;
3279
+ cursor: pointer;
3280
+ font-size: inherit;
3281
+ }
3282
+
3283
+ .comfy-menu .comfy-menu-actions .comfy-settings-btn {
3284
+ font-size: 0.6em;
3285
+ }
3286
+
3287
+ button.comfy-close-menu-btn {
3288
+ font-size: 1em;
3289
+ line-height: 12px;
3290
+ color: #ccc;
3291
+ position: relative;
3292
+ top: -1px;
3293
+ }
3294
+
3295
+ .comfy-menu-queue-size {
3296
+ flex: auto;
3297
+ }
3298
+
3299
+ .comfy-menu button,
3300
+ .comfy-modal button {
3301
+ font-size: 20px;
3302
+ }
3303
+
3304
+ .comfy-menu-btns {
3305
+ margin-bottom: 10px;
3306
+ width: 100%;
3307
+ }
3308
+
3309
+ .comfy-menu-btns button {
3310
+ font-size: 10px;
3311
+ width: 50%;
3312
+ color: var(--descrip-text) !important;
3313
+ }
3314
+
3315
+ .comfy-menu > button {
3316
+ width: 100%;
3317
+ }
3318
+
3319
+ .comfy-btn,
3320
+ .comfy-menu > button,
3321
+ .comfy-menu-btns button,
3322
+ .comfy-menu .comfy-list button,
3323
+ .comfy-modal button {
3324
+ color: var(--input-text);
3325
+ background-color: var(--comfy-input-bg);
3326
+ border-radius: 8px;
3327
+ border-color: var(--border-color);
3328
+ border-style: solid;
3329
+ margin-top: 2px;
3330
+ }
3331
+
3332
+ .comfy-btn:hover:not(:disabled),
3333
+ .comfy-menu > button:hover,
3334
+ .comfy-menu-btns button:hover,
3335
+ .comfy-menu .comfy-list button:hover,
3336
+ .comfy-modal button:hover,
3337
+ .comfy-menu-actions button:hover {
3338
+ filter: brightness(1.2);
3339
+ will-change: transform;
3340
+ cursor: pointer;
3341
+ }
3342
+
3343
+ span.drag-handle {
3344
+ width: 10px;
3345
+ height: 20px;
3346
+ display: inline-block;
3347
+ overflow: hidden;
3348
+ line-height: 5px;
3349
+ padding: 3px 4px;
3350
+ cursor: move;
3351
+ vertical-align: middle;
3352
+ margin-top: -0.4em;
3353
+ margin-left: -0.2em;
3354
+ font-size: 12px;
3355
+ font-family: sans-serif;
3356
+ letter-spacing: 2px;
3357
+ color: var(--drag-text);
3358
+ text-shadow: 1px 0 1px black;
3359
+ touch-action: none;
3360
+ }
3361
+
3362
+ span.drag-handle::after {
3363
+ content: '.. .. ..';
3364
+ }
3365
+
3366
+ .comfy-queue-btn {
3367
+ width: 100%;
3368
+ }
3369
+
3370
+ .comfy-list {
3371
+ color: var(--descrip-text);
3372
+ background-color: var(--comfy-menu-bg);
3373
+ margin-bottom: 10px;
3374
+ border-color: var(--border-color);
3375
+ border-style: solid;
3376
+ }
3377
+
3378
+ .comfy-list-items {
3379
+ overflow-y: scroll;
3380
+ max-height: 100px;
3381
+ min-height: 25px;
3382
+ background-color: var(--comfy-input-bg);
3383
+ padding: 5px;
3384
+ }
3385
+
3386
+ .comfy-list h4 {
3387
+ min-width: 160px;
3388
+ margin: 0;
3389
+ padding: 3px;
3390
+ font-weight: normal;
3391
+ }
3392
+
3393
+ .comfy-list-items button {
3394
+ font-size: 10px;
3395
+ }
3396
+
3397
+ .comfy-list-actions {
3398
+ margin: 5px;
3399
+ display: flex;
3400
+ gap: 5px;
3401
+ justify-content: center;
3402
+ }
3403
+
3404
+ .comfy-list-actions button {
3405
+ font-size: 12px;
3406
+ }
3407
+
3408
+ button.comfy-queue-btn {
3409
+ margin: 6px 0 !important;
3410
+ }
3411
+
3412
+ .comfy-modal.comfy-settings,
3413
+ .comfy-modal.comfy-manage-templates {
3414
+ text-align: center;
3415
+ font-family: sans-serif;
3416
+ color: var(--descrip-text);
3417
+ z-index: 99;
3418
+ }
3419
+
3420
+ .comfy-modal.comfy-settings input[type='range'] {
3421
+ vertical-align: middle;
3422
+ }
3423
+
3424
+ .comfy-modal.comfy-settings input[type='range'] + input[type='number'] {
3425
+ width: 3.5em;
3426
+ }
3427
+
3428
+ .comfy-modal input,
3429
+ .comfy-modal select {
3430
+ color: var(--input-text);
3431
+ background-color: var(--comfy-input-bg);
3432
+ border-radius: 8px;
3433
+ border-color: var(--border-color);
3434
+ border-style: solid;
3435
+ font-size: inherit;
3436
+ }
3437
+
3438
+ .comfy-tooltip-indicator {
3439
+ text-decoration: underline;
3440
+ text-decoration-style: dashed;
3441
+ }
3442
+
3443
+ @media only screen and (max-height: 850px) {
3444
+ .comfy-menu {
3445
+ top: 0 !important;
3446
+ bottom: 0 !important;
3447
+ left: auto !important;
3448
+ right: 0 !important;
3449
+ border-radius: 0;
3450
+ }
3451
+
3452
+ .comfy-menu span.drag-handle {
3453
+ display: none;
3454
+ }
3455
+
3456
+ .comfy-menu-queue-size {
3457
+ flex: unset;
3458
+ }
3459
+
3460
+ .comfy-menu-header {
3461
+ justify-content: space-between;
3462
+ }
3463
+ .comfy-menu-actions {
3464
+ gap: 10px;
3465
+ font-size: 28px;
3466
+ }
3467
+ }
3468
+
3469
+ /* Input popup */
3470
+
3471
+ .graphdialog {
3472
+ min-height: 1em;
3473
+ background-color: var(--comfy-menu-bg);
3474
+ }
3475
+
3476
+ .graphdialog .name {
3477
+ font-size: 14px;
3478
+ font-family: sans-serif;
3479
+ color: var(--descrip-text);
3480
+ }
3481
+
3482
+ .graphdialog button {
3483
+ margin-top: unset;
3484
+ vertical-align: unset;
3485
+ height: 1.6em;
3486
+ padding-right: 8px;
3487
+ }
3488
+
3489
+ .graphdialog input,
3490
+ .graphdialog textarea,
3491
+ .graphdialog select {
3492
+ background-color: var(--comfy-input-bg);
3493
+ border: 2px solid;
3494
+ border-color: var(--border-color);
3495
+ color: var(--input-text);
3496
+ border-radius: 12px 0 0 12px;
3497
+ }
3498
+
3499
+ /* Dialogs */
3500
+
3501
+ dialog {
3502
+ box-shadow: 0 0 20px #888888;
3503
+ }
3504
+
3505
+ dialog::backdrop {
3506
+ background: rgba(0, 0, 0, 0.5);
3507
+ }
3508
+
3509
+ .comfy-dialog.comfyui-dialog.comfy-modal {
3510
+ top: 0;
3511
+ left: 0;
3512
+ right: 0;
3513
+ bottom: 0;
3514
+ transform: none;
3515
+ }
3516
+
3517
+ .comfy-dialog.comfy-modal {
3518
+ font-family: Arial, sans-serif;
3519
+ border-color: var(--bg-color);
3520
+ box-shadow: none;
3521
+ border: 2px solid var(--border-color);
3522
+ }
3523
+
3524
+ .comfy-dialog .comfy-modal-content {
3525
+ flex-direction: row;
3526
+ flex-wrap: wrap;
3527
+ gap: 10px;
3528
+ color: var(--fg-color);
3529
+ }
3530
+
3531
+ .comfy-dialog .comfy-modal-content h3 {
3532
+ margin-top: 0;
3533
+ }
3534
+
3535
+ .comfy-dialog .comfy-modal-content > p {
3536
+ width: 100%;
3537
+ }
3538
+
3539
+ .comfy-dialog .comfy-modal-content > .comfyui-button {
3540
+ flex: 1;
3541
+ justify-content: center;
3542
+ }
3543
+
3544
+ #comfy-settings-dialog {
3545
+ padding: 0;
3546
+ width: 41rem;
3547
+ }
3548
+
3549
+ #comfy-settings-dialog tr > td:first-child {
3550
+ text-align: right;
3551
+ }
3552
+
3553
+ #comfy-settings-dialog tbody button,
3554
+ #comfy-settings-dialog table > button {
3555
+ background-color: var(--bg-color);
3556
+ border: 1px var(--border-color) solid;
3557
+ border-radius: 0;
3558
+ color: var(--input-text);
3559
+ font-size: 1rem;
3560
+ padding: 0.5rem;
3561
+ }
3562
+
3563
+ #comfy-settings-dialog button:hover {
3564
+ background-color: var(--tr-odd-bg-color);
3565
+ }
3566
+
3567
+ /* General CSS for tables */
3568
+
3569
+ .comfy-table {
3570
+ border-collapse: collapse;
3571
+ color: var(--input-text);
3572
+ font-family: Arial, sans-serif;
3573
+ width: 100%;
3574
+ }
3575
+
3576
+ .comfy-table caption {
3577
+ position: sticky;
3578
+ top: 0;
3579
+ background-color: var(--bg-color);
3580
+ color: var(--input-text);
3581
+ font-size: 1rem;
3582
+ font-weight: bold;
3583
+ padding: 8px;
3584
+ text-align: center;
3585
+ border-bottom: 1px solid var(--border-color);
3586
+ }
3587
+
3588
+ .comfy-table caption .comfy-btn {
3589
+ position: absolute;
3590
+ top: -2px;
3591
+ right: 0;
3592
+ bottom: 0;
3593
+ cursor: pointer;
3594
+ border: none;
3595
+ height: 100%;
3596
+ border-radius: 0;
3597
+ aspect-ratio: 1/1;
3598
+ -webkit-user-select: none;
3599
+ -moz-user-select: none;
3600
+ user-select: none;
3601
+ font-size: 20px;
3602
+ }
3603
+
3604
+ .comfy-table caption .comfy-btn:focus {
3605
+ outline: none;
3606
+ }
3607
+
3608
+ .comfy-table tr:nth-child(even) {
3609
+ background-color: var(--tr-even-bg-color);
3610
+ }
3611
+
3612
+ .comfy-table tr:nth-child(odd) {
3613
+ background-color: var(--tr-odd-bg-color);
3614
+ }
3615
+
3616
+ .comfy-table td,
3617
+ .comfy-table th {
3618
+ border: 1px solid var(--border-color);
3619
+ padding: 8px;
3620
+ }
3621
+
3622
+ /* Context menu */
3623
+
3624
+ .litegraph .dialog {
3625
+ z-index: 1;
3626
+ font-family: Arial, sans-serif;
3627
+ }
3628
+
3629
+ .litegraph .litemenu-entry.has_submenu {
3630
+ position: relative;
3631
+ padding-right: 20px;
3632
+ }
3633
+
3634
+ .litemenu-entry.has_submenu::after {
3635
+ content: '>';
3636
+ position: absolute;
3637
+ top: 0;
3638
+ right: 2px;
3639
+ }
3640
+
3641
+ .litegraph.litecontextmenu,
3642
+ .litegraph.litecontextmenu.dark {
3643
+ z-index: 9999 !important;
3644
+ background-color: var(--comfy-menu-bg) !important;
3645
+ }
3646
+
3647
+ .litegraph.litecontextmenu
3648
+ .litemenu-entry:hover:not(.disabled):not(.separator) {
3649
+ background-color: var(--comfy-menu-hover-bg, var(--border-color)) !important;
3650
+ color: var(--fg-color);
3651
+ }
3652
+
3653
+ .litegraph.litecontextmenu .litemenu-entry.submenu,
3654
+ .litegraph.litecontextmenu.dark .litemenu-entry.submenu {
3655
+ background-color: var(--comfy-menu-bg) !important;
3656
+ color: var(--input-text);
3657
+ }
3658
+
3659
+ .litegraph.litecontextmenu input {
3660
+ background-color: var(--comfy-input-bg) !important;
3661
+ color: var(--input-text) !important;
3662
+ }
3663
+
3664
+ .comfy-context-menu-filter {
3665
+ box-sizing: border-box;
3666
+ border: 1px solid #999;
3667
+ margin: 0 0 5px 5px;
3668
+ width: calc(100% - 10px);
3669
+ }
3670
+
3671
+ .comfy-img-preview {
3672
+ pointer-events: none;
3673
+ overflow: hidden;
3674
+ display: flex;
3675
+ flex-wrap: wrap;
3676
+ align-content: flex-start;
3677
+ justify-content: center;
3678
+ }
3679
+
3680
+ .comfy-img-preview img {
3681
+ -o-object-fit: contain;
3682
+ object-fit: contain;
3683
+ width: var(--comfy-img-preview-width);
3684
+ height: var(--comfy-img-preview-height);
3685
+ }
3686
+
3687
+ .comfy-missing-nodes li button {
3688
+ font-size: 12px;
3689
+ margin-left: 5px;
3690
+ }
3691
+
3692
+ /* Search box */
3693
+
3694
+ .litegraph.litesearchbox {
3695
+ z-index: 9999 !important;
3696
+ background-color: var(--comfy-menu-bg) !important;
3697
+ overflow: hidden;
3698
+ display: block;
3699
+ }
3700
+
3701
+ .litegraph.litesearchbox input,
3702
+ .litegraph.litesearchbox select {
3703
+ background-color: var(--comfy-input-bg) !important;
3704
+ color: var(--input-text);
3705
+ }
3706
+
3707
+ .litegraph.lite-search-item {
3708
+ color: var(--input-text);
3709
+ background-color: var(--comfy-input-bg);
3710
+ filter: brightness(80%);
3711
+ will-change: transform;
3712
+ padding-left: 0.2em;
3713
+ }
3714
+
3715
+ .litegraph.lite-search-item.generic_type {
3716
+ color: var(--input-text);
3717
+ filter: brightness(50%);
3718
+ will-change: transform;
3719
+ }
3720
+
3721
+ @media only screen and (max-width: 450px) {
3722
+ #comfy-settings-dialog .comfy-table tbody {
3723
+ display: grid;
3724
+ }
3725
+ #comfy-settings-dialog .comfy-table tr {
3726
+ display: grid;
3727
+ }
3728
+ #comfy-settings-dialog tr > td:first-child {
3729
+ text-align: center;
3730
+ border-bottom: none;
3731
+ padding-bottom: 0;
3732
+ }
3733
+ #comfy-settings-dialog tr > td:not(:first-child) {
3734
+ text-align: center;
3735
+ border-top: none;
3736
+ }
3737
+ }
3738
+
3739
+ audio.comfy-audio.empty-audio-widget {
3740
+ display: none;
3741
+ }
3742
+
3743
+ #vue-app {
3744
+ position: absolute;
3745
+ top: 0;
3746
+ left: 0;
3747
+ width: 100%;
3748
+ height: 100%;
3749
+ pointer-events: none;
3750
+ }
3751
+
3752
+ /* Set auto complete panel's width as it is not accessible within vue-root */
3753
+ .p-autocomplete-overlay {
3754
+ max-width: 25vw;
3755
+ }
3756
+
3757
+ .p-tree-node-content {
3758
+ padding: var(--comfy-tree-explorer-item-padding) !important;
3759
+ }
3760
+
3761
+ /* Load3d styles */
3762
+ .comfy-load-3d,
3763
+ .comfy-load-3d-animation,
3764
+ .comfy-preview-3d,
3765
+ .comfy-preview-3d-animation{
3766
+ display: flex;
3767
+ flex-direction: column;
3768
+ background: transparent;
3769
+ flex: 1;
3770
+ position: relative;
3771
+ overflow: hidden;
3772
+ }
3773
+
3774
+ .comfy-load-3d canvas,
3775
+ .comfy-load-3d-animation canvas,
3776
+ .comfy-preview-3d canvas,
3777
+ .comfy-preview-3d-animation canvas{
3778
+ display: flex;
3779
+ width: 100% !important;
3780
+ height: 100% !important;
3781
+ }
3782
+
3783
+ /* End of Load3d styles */
3784
+
3785
+ /* [Desktop] Electron window specific styles */
3786
+ .app-drag {
3787
+ app-region: drag;
3788
+ }
3789
+
3790
+ .no-drag {
3791
+ app-region: no-drag;
3792
+ }
3793
+
3794
+ .window-actions-spacer {
3795
+ width: calc(100vw - env(titlebar-area-width, 100vw));
3796
+ }
3797
+ /* End of [Desktop] Electron window specific styles */
3798
+ .hover\:bg-neutral-700:hover{
3799
+ --tw-bg-opacity: 1;
3800
+ background-color: rgb(64 64 64 / var(--tw-bg-opacity));
3801
+ }
3802
+ .hover\:bg-opacity-75:hover{
3803
+ --tw-bg-opacity: 0.75;
3804
+ }
3805
+ .hover\:text-blue-300:hover{
3806
+ --tw-text-opacity: 1;
3807
+ color: rgb(144 205 244 / var(--tw-text-opacity));
3808
+ }
3809
+ .hover\:opacity-100:hover{
3810
+ opacity: 1;
3811
+ }
3812
+ @media (prefers-reduced-motion: no-preference){
3813
+
3814
+ .motion-safe\:w-0{
3815
+ width: 0px;
3816
+ }
3817
+
3818
+ .motion-safe\:opacity-0{
3819
+ opacity: 0;
3820
+ }
3821
+
3822
+ .group\/sidebar-tab:focus-within .motion-safe\:group-focus-within\/sidebar-tab\:w-auto{
3823
+ width: auto;
3824
+ }
3825
+
3826
+ .group\/sidebar-tab:focus-within .motion-safe\:group-focus-within\/sidebar-tab\:opacity-100{
3827
+ opacity: 1;
3828
+ }
3829
+
3830
+ .group\/sidebar-tab:hover .motion-safe\:group-hover\/sidebar-tab\:w-auto{
3831
+ width: auto;
3832
+ }
3833
+
3834
+ .group\/sidebar-tab:hover .motion-safe\:group-hover\/sidebar-tab\:opacity-100{
3835
+ opacity: 1;
3836
+ }
3837
+
3838
+ .group\/tree-node:hover .motion-safe\:group-hover\/tree-node\:opacity-100{
3839
+ opacity: 1;
3840
+ }
3841
+ }
3842
+ @media not all and (min-width: 640px){
3843
+
3844
+ .max-sm\:hidden{
3845
+ display: none;
3846
+ }
3847
+ }
3848
+ @media (min-width: 768px){
3849
+
3850
+ .md\:flex{
3851
+ display: flex;
3852
+ }
3853
+
3854
+ .md\:hidden{
3855
+ display: none;
3856
+ }
3857
+ }
3858
+ @media (min-width: 1536px){
3859
+
3860
+ .\32xl\:mx-4{
3861
+ margin-left: 1rem;
3862
+ margin-right: 1rem;
3863
+ }
3864
+
3865
+ .\32xl\:w-64{
3866
+ width: 16rem;
3867
+ }
3868
+
3869
+ .\32xl\:max-w-full{
3870
+ max-width: 100%;
3871
+ }
3872
+
3873
+ .\32xl\:p-16{
3874
+ padding: 4rem;
3875
+ }
3876
+
3877
+ .\32xl\:p-4{
3878
+ padding: 1rem;
3879
+ }
3880
+
3881
+ .\32xl\:p-\[var\(--p-dialog-content-padding\)\]{
3882
+ padding: var(--p-dialog-content-padding);
3883
+ }
3884
+
3885
+ .\32xl\:p-\[var\(--p-dialog-header-padding\)\]{
3886
+ padding: var(--p-dialog-header-padding);
3887
+ }
3888
+
3889
+ .\32xl\:px-4{
3890
+ padding-left: 1rem;
3891
+ padding-right: 1rem;
3892
+ }
3893
+
3894
+ .\32xl\:text-sm{
3895
+ font-size: 0.875rem;
3896
+ }
3897
+ }
3898
+ @media (prefers-color-scheme: dark){
3899
+
3900
+ .dark\:bg-gray-800{
3901
+ --tw-bg-opacity: 1;
3902
+ background-color: rgb(45 55 72 / var(--tw-bg-opacity));
3903
+ }
3904
+ }
3905
+
3906
+ .global-dialog .p-dialog-header {
3907
+ padding: 0.5rem
3908
+ }
3909
+ @media (min-width: 1536px) {
3910
+ .global-dialog .p-dialog-header {
3911
+ padding: var(--p-dialog-header-padding)
3912
+ }
3913
+ }
3914
+ .global-dialog .p-dialog-header {
3915
+ padding-bottom: 0px
3916
+ }
3917
+ .global-dialog .p-dialog-content {
3918
+ padding: 0.5rem
3919
+ }
3920
+ @media (min-width: 1536px) {
3921
+ .global-dialog .p-dialog-content {
3922
+ padding: var(--p-dialog-content-padding)
3923
+ }
3924
+ }
3925
+ .global-dialog .p-dialog-content {
3926
+ padding-top: 0px
3927
+ }
3928
+
3929
+ .prompt-dialog-content[data-v-3df70997] {
3930
+ white-space: pre-wrap;
3931
+ }
3932
+
3933
+ .no-results-placeholder[data-v-f2b77816] .p-card {
3934
+ background-color: var(--surface-ground);
3935
+ text-align: center;
3936
+ box-shadow: unset;
3937
+ }
3938
+ .no-results-placeholder h3[data-v-f2b77816] {
3939
+ color: var(--text-color);
3940
+ margin-bottom: 0.5rem;
3941
+ }
3942
+ .no-results-placeholder p[data-v-f2b77816] {
3943
+ color: var(--text-color-secondary);
3944
+ margin-bottom: 1rem;
3945
+ }
3946
+
3947
+ .comfy-error-report[data-v-3faf7785] {
3948
+ display: flex;
3949
+ flex-direction: column;
3950
+ gap: 1rem;
3951
+ }
3952
+ .action-container[data-v-3faf7785] {
3953
+ display: flex;
3954
+ gap: 1rem;
3955
+ justify-content: flex-end;
3956
+ }
3957
+ .wrapper-pre[data-v-3faf7785] {
3958
+ white-space: pre-wrap;
3959
+ word-wrap: break-word;
3960
+ }
3961
+
3962
+ .comfy-missing-nodes[data-v-425cc3ac] {
3963
+ max-height: 300px;
3964
+ overflow-y: auto;
3965
+ }
3966
+ .node-hint[data-v-425cc3ac] {
3967
+ margin-left: 0.5rem;
3968
+ font-style: italic;
3969
+ color: var(--text-color-secondary);
3970
+ }
3971
+ [data-v-425cc3ac] .p-button {
3972
+ margin-left: auto;
3973
+ }
3974
+
3975
+ .comfy-missing-models[data-v-f8d63775] {
3976
+ max-height: 300px;
3977
+ overflow-y: auto;
3978
+ }
3979
+
3980
+ [data-v-53692f7e] .i-badge {
3981
+
3982
+ --tw-bg-opacity: 1;
3983
+
3984
+ background-color: rgb(150 206 76 / var(--tw-bg-opacity));
3985
+
3986
+ --tw-text-opacity: 1;
3987
+
3988
+ color: rgb(255 255 255 / var(--tw-text-opacity))
3989
+ }
3990
+ [data-v-53692f7e] .o-badge {
3991
+
3992
+ --tw-bg-opacity: 1;
3993
+
3994
+ background-color: rgb(239 68 68 / var(--tw-bg-opacity));
3995
+
3996
+ --tw-text-opacity: 1;
3997
+
3998
+ color: rgb(255 255 255 / var(--tw-text-opacity))
3999
+ }
4000
+ [data-v-53692f7e] .c-badge {
4001
+
4002
+ --tw-bg-opacity: 1;
4003
+
4004
+ background-color: rgb(66 153 225 / var(--tw-bg-opacity));
4005
+
4006
+ --tw-text-opacity: 1;
4007
+
4008
+ color: rgb(255 255 255 / var(--tw-text-opacity))
4009
+ }
4010
+ [data-v-53692f7e] .s-badge {
4011
+
4012
+ --tw-bg-opacity: 1;
4013
+
4014
+ background-color: rgb(234 179 8 / var(--tw-bg-opacity))
4015
+ }
4016
+
4017
+ [data-v-b3ab067d] .p-inputtext {
4018
+ --p-form-field-padding-x: 0.625rem;
4019
+ }
4020
+ .p-button.p-inputicon[data-v-b3ab067d] {
4021
+ width: auto;
4022
+ border-style: none;
4023
+ padding: 0px;
4024
+ }
4025
+
4026
+ .form-input[data-v-1451da7b] .input-slider .p-inputnumber input,
4027
+ .form-input[data-v-1451da7b] .input-slider .slider-part {
4028
+
4029
+ width: 5rem
4030
+ }
4031
+ .form-input[data-v-1451da7b] .p-inputtext,
4032
+ .form-input[data-v-1451da7b] .p-select {
4033
+
4034
+ width: 11rem
4035
+ }
4036
+
4037
+ .settings-tab-panels {
4038
+ padding-top: 0px !important;
4039
+ }
4040
+
4041
+ .settings-container[data-v-2e21278f] {
4042
+ display: flex;
4043
+ height: 70vh;
4044
+ width: 60vw;
4045
+ max-width: 1024px;
4046
+ overflow: hidden;
4047
+ }
4048
+ @media (max-width: 768px) {
4049
+ .settings-container[data-v-2e21278f] {
4050
+ flex-direction: column;
4051
+ height: auto;
4052
+ width: 80vw;
4053
+ }
4054
+ .settings-sidebar[data-v-2e21278f] {
4055
+ width: 100%;
4056
+ }
4057
+ .settings-content[data-v-2e21278f] {
4058
+ height: 350px;
4059
+ }
4060
+ }
4061
+
4062
+ /* Show a separator line above the Keybinding tab */
4063
+ /* This indicates the start of custom setting panels */
4064
+ .settings-sidebar[data-v-2e21278f] .p-listbox-option[aria-label='Keybinding'] {
4065
+ position: relative;
4066
+ }
4067
+ .settings-sidebar[data-v-2e21278f] .p-listbox-option[aria-label='Keybinding']::before {
4068
+ position: absolute;
4069
+ top: 0px;
4070
+ left: 0px;
4071
+ width: 100%;
4072
+ --tw-content: '';
4073
+ content: var(--tw-content);
4074
+ border-top: 1px solid var(--p-divider-border-color);
4075
+ }
4076
+
4077
+ .pi-cog[data-v-43089afc] {
4078
+ font-size: 1.25rem;
4079
+ margin-right: 0.5rem;
4080
+ }
4081
+ .version-tag[data-v-43089afc] {
4082
+ margin-left: 0.5rem;
4083
+ }
4084
+
4085
+ .p-card[data-v-ffc83afa] {
4086
+ --p-card-body-padding: 10px 0 0 0;
4087
+ overflow: hidden;
4088
+ }
4089
+ [data-v-ffc83afa] .p-card-subtitle {
4090
+ text-align: center;
4091
+ }
4092
+
4093
+ .carousel[data-v-d9962275] {
4094
+ width: 66vw;
4095
+ }
4096
+ /**
4097
+ * Copyright (c) 2014 The xterm.js authors. All rights reserved.
4098
+ * Copyright (c) 2012-2013, Christopher Jeffrey (MIT License)
4099
+ * https://github.com/chjj/term.js
4100
+ * @license MIT
4101
+ *
4102
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
4103
+ * of this software and associated documentation files (the "Software"), to deal
4104
+ * in the Software without restriction, including without limitation the rights
4105
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
4106
+ * copies of the Software, and to permit persons to whom the Software is
4107
+ * furnished to do so, subject to the following conditions:
4108
+ *
4109
+ * The above copyright notice and this permission notice shall be included in
4110
+ * all copies or substantial portions of the Software.
4111
+ *
4112
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4113
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4114
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4115
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4116
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4117
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
4118
+ * THE SOFTWARE.
4119
+ *
4120
+ * Originally forked from (with the author's permission):
4121
+ * Fabrice Bellard's javascript vt100 for jslinux:
4122
+ * http://bellard.org/jslinux/
4123
+ * Copyright (c) 2011 Fabrice Bellard
4124
+ * The original design remains. The terminal itself
4125
+ * has been extended to include xterm CSI codes, among
4126
+ * other features.
4127
+ */
4128
+
4129
+ /**
4130
+ * Default styles for xterm.js
4131
+ */
4132
+
4133
+ .xterm {
4134
+ cursor: text;
4135
+ position: relative;
4136
+ -moz-user-select: none;
4137
+ user-select: none;
4138
+ -ms-user-select: none;
4139
+ -webkit-user-select: none;
4140
+ }
4141
+
4142
+ .xterm.focus,
4143
+ .xterm:focus {
4144
+ outline: none;
4145
+ }
4146
+
4147
+ .xterm .xterm-helpers {
4148
+ position: absolute;
4149
+ top: 0;
4150
+ /**
4151
+ * The z-index of the helpers must be higher than the canvases in order for
4152
+ * IMEs to appear on top.
4153
+ */
4154
+ z-index: 5;
4155
+ }
4156
+
4157
+ .xterm .xterm-helper-textarea {
4158
+ padding: 0;
4159
+ border: 0;
4160
+ margin: 0;
4161
+ /* Move textarea out of the screen to the far left, so that the cursor is not visible */
4162
+ position: absolute;
4163
+ opacity: 0;
4164
+ left: -9999em;
4165
+ top: 0;
4166
+ width: 0;
4167
+ height: 0;
4168
+ z-index: -5;
4169
+ /** Prevent wrapping so the IME appears against the textarea at the correct position */
4170
+ white-space: nowrap;
4171
+ overflow: hidden;
4172
+ resize: none;
4173
+ }
4174
+
4175
+ .xterm .composition-view {
4176
+ /* TODO: Composition position got messed up somewhere */
4177
+ background: #000;
4178
+ color: #FFF;
4179
+ display: none;
4180
+ position: absolute;
4181
+ white-space: nowrap;
4182
+ z-index: 1;
4183
+ }
4184
+
4185
+ .xterm .composition-view.active {
4186
+ display: block;
4187
+ }
4188
+
4189
+ .xterm .xterm-viewport {
4190
+ /* On OS X this is required in order for the scroll bar to appear fully opaque */
4191
+ background-color: #000;
4192
+ overflow-y: scroll;
4193
+ cursor: default;
4194
+ position: absolute;
4195
+ right: 0;
4196
+ left: 0;
4197
+ top: 0;
4198
+ bottom: 0;
4199
+ }
4200
+
4201
+ .xterm .xterm-screen {
4202
+ position: relative;
4203
+ }
4204
+
4205
+ .xterm .xterm-screen canvas {
4206
+ position: absolute;
4207
+ left: 0;
4208
+ top: 0;
4209
+ }
4210
+
4211
+ .xterm .xterm-scroll-area {
4212
+ visibility: hidden;
4213
+ }
4214
+
4215
+ .xterm-char-measure-element {
4216
+ display: inline-block;
4217
+ visibility: hidden;
4218
+ position: absolute;
4219
+ top: 0;
4220
+ left: -9999em;
4221
+ line-height: normal;
4222
+ }
4223
+
4224
+ .xterm.enable-mouse-events {
4225
+ /* When mouse events are enabled (e.g. tmux), revert to the standard pointer cursor */
4226
+ cursor: default;
4227
+ }
4228
+
4229
+ .xterm.xterm-cursor-pointer,
4230
+ .xterm .xterm-cursor-pointer {
4231
+ cursor: pointer;
4232
+ }
4233
+
4234
+ .xterm.column-select.focus {
4235
+ /* Column selection mode */
4236
+ cursor: crosshair;
4237
+ }
4238
+
4239
+ .xterm .xterm-accessibility:not(.debug),
4240
+ .xterm .xterm-message {
4241
+ position: absolute;
4242
+ left: 0;
4243
+ top: 0;
4244
+ bottom: 0;
4245
+ right: 0;
4246
+ z-index: 10;
4247
+ color: transparent;
4248
+ pointer-events: none;
4249
+ }
4250
+
4251
+ .xterm .xterm-accessibility-tree:not(.debug) *::-moz-selection {
4252
+ color: transparent;
4253
+ }
4254
+
4255
+ .xterm .xterm-accessibility-tree:not(.debug) *::selection {
4256
+ color: transparent;
4257
+ }
4258
+
4259
+ .xterm .xterm-accessibility-tree {
4260
+ -webkit-user-select: text;
4261
+ -moz-user-select: text;
4262
+ user-select: text;
4263
+ white-space: pre;
4264
+ }
4265
+
4266
+ .xterm .live-region {
4267
+ position: absolute;
4268
+ left: -9999px;
4269
+ width: 1px;
4270
+ height: 1px;
4271
+ overflow: hidden;
4272
+ }
4273
+
4274
+ .xterm-dim {
4275
+ /* Dim should not apply to background, so the opacity of the foreground color is applied
4276
+ * explicitly in the generated class and reset to 1 here */
4277
+ opacity: 1 !important;
4278
+ }
4279
+
4280
+ .xterm-underline-1 { text-decoration: underline; }
4281
+ .xterm-underline-2 { -webkit-text-decoration: double underline; text-decoration: double underline; }
4282
+ .xterm-underline-3 { -webkit-text-decoration: wavy underline; text-decoration: wavy underline; }
4283
+ .xterm-underline-4 { -webkit-text-decoration: dotted underline; text-decoration: dotted underline; }
4284
+ .xterm-underline-5 { -webkit-text-decoration: dashed underline; text-decoration: dashed underline; }
4285
+
4286
+ .xterm-overline {
4287
+ text-decoration: overline;
4288
+ }
4289
+
4290
+ .xterm-overline.xterm-underline-1 { text-decoration: overline underline; }
4291
+ .xterm-overline.xterm-underline-2 { -webkit-text-decoration: overline double underline; text-decoration: overline double underline; }
4292
+ .xterm-overline.xterm-underline-3 { -webkit-text-decoration: overline wavy underline; text-decoration: overline wavy underline; }
4293
+ .xterm-overline.xterm-underline-4 { -webkit-text-decoration: overline dotted underline; text-decoration: overline dotted underline; }
4294
+ .xterm-overline.xterm-underline-5 { -webkit-text-decoration: overline dashed underline; text-decoration: overline dashed underline; }
4295
+
4296
+ .xterm-strikethrough {
4297
+ text-decoration: line-through;
4298
+ }
4299
+
4300
+ .xterm-screen .xterm-decoration-container .xterm-decoration {
4301
+ z-index: 6;
4302
+ position: absolute;
4303
+ }
4304
+
4305
+ .xterm-screen .xterm-decoration-container .xterm-decoration.xterm-decoration-top-layer {
4306
+ z-index: 7;
4307
+ }
4308
+
4309
+ .xterm-decoration-overview-ruler {
4310
+ z-index: 8;
4311
+ position: absolute;
4312
+ top: 0;
4313
+ right: 0;
4314
+ pointer-events: none;
4315
+ }
4316
+
4317
+ .xterm-decoration-top {
4318
+ z-index: 2;
4319
+ position: relative;
4320
+ }
4321
+
4322
+ [data-v-250ab9af] .p-terminal .xterm {
4323
+ overflow-x: auto;
4324
+ }
4325
+ [data-v-250ab9af] .p-terminal .xterm-screen {
4326
+ background-color: black;
4327
+ overflow-y: hidden;
4328
+ }
4329
+
4330
+ [data-v-90a7f075] .p-terminal .xterm {
4331
+ overflow-x: auto;
4332
+ }
4333
+ [data-v-90a7f075] .p-terminal .xterm-screen {
4334
+ background-color: black;
4335
+ overflow-y: hidden;
4336
+ }
4337
+
4338
+ [data-v-03daf1c8] .p-terminal .xterm {
4339
+ overflow-x: auto;
4340
+ }
4341
+ [data-v-03daf1c8] .p-terminal .xterm-screen {
4342
+ background-color: black;
4343
+ overflow-y: hidden;
4344
+ }
4345
+ .mdi.rotate270::before {
4346
+ transform: rotate(270deg);
4347
+ }
4348
+
4349
+ /* Generic */
4350
+ .comfyui-button {
4351
+ display: flex;
4352
+ align-items: center;
4353
+ gap: 0.5em;
4354
+ cursor: pointer;
4355
+ border: none;
4356
+ border-radius: 4px;
4357
+ padding: 4px 8px;
4358
+ box-sizing: border-box;
4359
+ margin: 0;
4360
+ transition: box-shadow 0.1s;
4361
+ }
4362
+
4363
+ .comfyui-button:active {
4364
+ box-shadow: inset 1px 1px 10px rgba(0, 0, 0, 0.5);
4365
+ }
4366
+
4367
+ .comfyui-button:disabled {
4368
+ opacity: 0.5;
4369
+ cursor: not-allowed;
4370
+ }
4371
+ .primary .comfyui-button,
4372
+ .primary.comfyui-button {
4373
+ background-color: var(--primary-bg) !important;
4374
+ color: var(--primary-fg) !important;
4375
+ }
4376
+
4377
+ .primary .comfyui-button:not(:disabled):hover,
4378
+ .primary.comfyui-button:not(:disabled):hover {
4379
+ background-color: var(--primary-hover-bg) !important;
4380
+ color: var(--primary-hover-fg) !important;
4381
+ }
4382
+
4383
+ /* Popup */
4384
+ .comfyui-popup {
4385
+ position: absolute;
4386
+ left: var(--left);
4387
+ right: var(--right);
4388
+ top: var(--top);
4389
+ bottom: var(--bottom);
4390
+ z-index: 2000;
4391
+ max-height: calc(100vh - var(--limit) - 10px);
4392
+ box-shadow: 3px 3px 5px 0px rgba(0, 0, 0, 0.3);
4393
+ }
4394
+
4395
+ .comfyui-popup:not(.open) {
4396
+ display: none;
4397
+ }
4398
+
4399
+ .comfyui-popup.right.open {
4400
+ border-top-left-radius: 4px;
4401
+ border-bottom-right-radius: 4px;
4402
+ border-bottom-left-radius: 4px;
4403
+ overflow: hidden;
4404
+ }
4405
+ /* Split button */
4406
+ .comfyui-split-button {
4407
+ position: relative;
4408
+ display: flex;
4409
+ }
4410
+
4411
+ .comfyui-split-primary {
4412
+ flex: auto;
4413
+ }
4414
+
4415
+ .comfyui-split-primary .comfyui-button {
4416
+ border-top-right-radius: 0;
4417
+ border-bottom-right-radius: 0;
4418
+ border-right: 1px solid var(--comfy-menu-bg);
4419
+ width: 100%;
4420
+ }
4421
+
4422
+ .comfyui-split-arrow .comfyui-button {
4423
+ border-top-left-radius: 0;
4424
+ border-bottom-left-radius: 0;
4425
+ padding-left: 2px;
4426
+ padding-right: 2px;
4427
+ }
4428
+
4429
+ .comfyui-split-button-popup {
4430
+ white-space: nowrap;
4431
+ background-color: var(--content-bg);
4432
+ color: var(--content-fg);
4433
+ display: flex;
4434
+ flex-direction: column;
4435
+ overflow: auto;
4436
+ }
4437
+
4438
+ .comfyui-split-button-popup.hover {
4439
+ z-index: 2001;
4440
+ }
4441
+ .comfyui-split-button-popup > .comfyui-button {
4442
+ border: none;
4443
+ background-color: transparent;
4444
+ color: var(--fg-color);
4445
+ padding: 8px 12px 8px 8px;
4446
+ }
4447
+
4448
+ .comfyui-split-button-popup > .comfyui-button:not(:disabled):hover {
4449
+ background-color: var(--comfy-input-bg);
4450
+ }
4451
+
4452
+ /* Button group */
4453
+ .comfyui-button-group {
4454
+ display: flex;
4455
+ border-radius: 4px;
4456
+ overflow: hidden;
4457
+ }
4458
+
4459
+ .comfyui-button-group:empty {
4460
+ display: none;
4461
+ }
4462
+ .comfyui-button-group > .comfyui-button,
4463
+ .comfyui-button-group > .comfyui-button-wrapper > .comfyui-button {
4464
+ padding: 4px 10px;
4465
+ border-radius: 0;
4466
+ }
4467
+
4468
+ /* Menu */
4469
+ .comfyui-menu .mdi::before {
4470
+ font-size: 18px;
4471
+ }
4472
+
4473
+ .comfyui-menu .comfyui-button {
4474
+ background: var(--comfy-input-bg);
4475
+ color: var(--fg-color);
4476
+ white-space: nowrap;
4477
+ }
4478
+
4479
+ .comfyui-menu .comfyui-button:not(:disabled):hover {
4480
+ background: var(--border-color);
4481
+ color: var(--content-fg);
4482
+ }
4483
+
4484
+ .comfyui-menu .comfyui-split-button-popup > .comfyui-button {
4485
+ border-radius: 0;
4486
+ background-color: transparent;
4487
+ }
4488
+
4489
+ .comfyui-menu .comfyui-split-button-popup > .comfyui-button:not(:disabled):hover {
4490
+ background-color: var(--comfy-input-bg);
4491
+ }
4492
+
4493
+ .comfyui-menu .comfyui-split-button-popup.left {
4494
+ border-top-right-radius: 4px;
4495
+ border-bottom-left-radius: 4px;
4496
+ border-bottom-right-radius: 4px;
4497
+ }
4498
+
4499
+ .comfyui-menu .comfyui-button.popup-open {
4500
+ background-color: var(--content-bg);
4501
+ color: var(--content-fg);
4502
+ }
4503
+
4504
+ .comfyui-menu-push {
4505
+ margin-left: -0.8em;
4506
+ flex: auto;
4507
+ }
4508
+
4509
+ /** Send to workflow widget selection dialog */
4510
+ .comfy-widget-selection-dialog {
4511
+ border: none;
4512
+ }
4513
+
4514
+ .comfy-widget-selection-dialog div {
4515
+ color: var(--fg-color);
4516
+ font-family: Arial, Helvetica, sans-serif;
4517
+ }
4518
+
4519
+ .comfy-widget-selection-dialog h2 {
4520
+ margin-top: 0;
4521
+ }
4522
+
4523
+ .comfy-widget-selection-dialog section {
4524
+ width: -moz-fit-content;
4525
+ width: fit-content;
4526
+ display: flex;
4527
+ flex-direction: column;
4528
+ }
4529
+
4530
+ .comfy-widget-selection-item {
4531
+ display: flex;
4532
+ gap: 10px;
4533
+ align-items: center;
4534
+ }
4535
+
4536
+ .comfy-widget-selection-item span {
4537
+ margin-right: auto;
4538
+ }
4539
+
4540
+ .comfy-widget-selection-item span::before {
4541
+ content: '#' attr(data-id);
4542
+ opacity: 0.5;
4543
+ margin-right: 5px;
4544
+ }
4545
+
4546
+ .comfy-modal .comfy-widget-selection-item button {
4547
+ font-size: 1em;
4548
+ }
4549
+
4550
+ /***** Responsive *****/
4551
+ .lg.comfyui-menu .lt-lg-show {
4552
+ display: none !important;
4553
+ }
4554
+ .comfyui-menu:not(.lg) .nlg-hide {
4555
+ display: none !important;
4556
+ }
4557
+ /** Large screen */
4558
+ .lg.comfyui-menu>.comfyui-menu-mobile-collapse .comfyui-button span,
4559
+ .lg.comfyui-menu>.comfyui-menu-mobile-collapse.comfyui-button span {
4560
+ display: none;
4561
+ }
4562
+ .lg.comfyui-menu>.comfyui-menu-mobile-collapse .comfyui-popup .comfyui-button span {
4563
+ display: unset;
4564
+ }
4565
+
4566
+ /** Non large screen */
4567
+ .lt-lg.comfyui-menu {
4568
+ flex-wrap: wrap;
4569
+ }
4570
+
4571
+ .lt-lg.comfyui-menu > *:not(.comfyui-menu-mobile-collapse) {
4572
+ order: 1;
4573
+ }
4574
+
4575
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse {
4576
+ order: 9999;
4577
+ width: 100%;
4578
+ }
4579
+
4580
+ .comfyui-body-bottom .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse {
4581
+ order: -1;
4582
+ }
4583
+
4584
+ .comfyui-body-bottom .lt-lg.comfyui-menu > .comfyui-menu-button {
4585
+ top: unset;
4586
+ bottom: 4px;
4587
+ }
4588
+
4589
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse.comfyui-button-group {
4590
+ flex-wrap: wrap;
4591
+ }
4592
+
4593
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse .comfyui-button,
4594
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse.comfyui-button {
4595
+ padding: 10px;
4596
+ }
4597
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse .comfyui-button,
4598
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse .comfyui-button-wrapper {
4599
+ width: 100%;
4600
+ }
4601
+
4602
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse .comfyui-popup {
4603
+ position: static;
4604
+ background-color: var(--comfy-input-bg);
4605
+ max-width: unset;
4606
+ max-height: 50vh;
4607
+ overflow: auto;
4608
+ }
4609
+
4610
+ .lt-lg.comfyui-menu:not(.expanded) > .comfyui-menu-mobile-collapse {
4611
+ display: none;
4612
+ }
4613
+
4614
+ .lt-lg .comfyui-menu-button {
4615
+ position: absolute;
4616
+ top: 4px;
4617
+ right: 8px;
4618
+ }
4619
+
4620
+ .lt-lg.comfyui-menu > .comfyui-menu-mobile-collapse .comfyui-view-list-popup {
4621
+ border-radius: 0;
4622
+ }
4623
+
4624
+ .lt-lg.comfyui-menu .comfyui-workflows-popup {
4625
+ width: 100vw;
4626
+ }
4627
+
4628
+ /** Small */
4629
+ .lt-md .comfyui-workflows-button-inner {
4630
+ width: unset !important;
4631
+ }
4632
+ .lt-md .comfyui-workflows-label {
4633
+ display: none;
4634
+ }
4635
+
4636
+ /** Extra small */
4637
+ .lt-sm .comfyui-interrupt-button {
4638
+ margin-right: 45px;
4639
+ }
4640
+ .comfyui-body-bottom .lt-sm.comfyui-menu > .comfyui-menu-button{
4641
+ bottom: 41px;
4642
+ }
4643
+
4644
+
4645
+ .editable-text[data-v-d670c40f] {
4646
+ display: inline;
4647
+ }
4648
+ .editable-text input[data-v-d670c40f] {
4649
+ width: 100%;
4650
+ box-sizing: border-box;
4651
+ }
4652
+
4653
+ .tree-node[data-v-654109c7] {
4654
+ width: 100%;
4655
+ display: flex;
4656
+ align-items: center;
4657
+ justify-content: space-between;
4658
+ }
4659
+ .leaf-count-badge[data-v-654109c7] {
4660
+ margin-left: 0.5rem;
4661
+ }
4662
+ .node-content[data-v-654109c7] {
4663
+ display: flex;
4664
+ align-items: center;
4665
+ flex-grow: 1;
4666
+ }
4667
+ .leaf-label[data-v-654109c7] {
4668
+ margin-left: 0.5rem;
4669
+ }
4670
+ [data-v-654109c7] .editable-text span {
4671
+ word-break: break-all;
4672
+ }
4673
+
4674
+ [data-v-976a6d58] .tree-explorer-node-label {
4675
+ width: 100%;
4676
+ display: flex;
4677
+ align-items: center;
4678
+ margin-left: var(--p-tree-node-gap);
4679
+ flex-grow: 1;
4680
+ }
4681
+
4682
+ /*
4683
+ * The following styles are necessary to avoid layout shift when dragging nodes over folders.
4684
+ * By setting the position to relative on the parent and using an absolutely positioned pseudo-element,
4685
+ * we can create a visual indicator for the drop target without affecting the layout of other elements.
4686
+ */
4687
+ [data-v-976a6d58] .p-tree-node-content:has(.tree-folder) {
4688
+ position: relative;
4689
+ }
4690
+ [data-v-976a6d58] .p-tree-node-content:has(.tree-folder.can-drop)::after {
4691
+ content: '';
4692
+ position: absolute;
4693
+ top: 0;
4694
+ left: 0;
4695
+ right: 0;
4696
+ bottom: 0;
4697
+ border: 1px solid var(--p-content-color);
4698
+ pointer-events: none;
4699
+ }
4700
+
4701
+ [data-v-0061c432] .p-toolbar-end .p-button {
4702
+
4703
+ padding-top: 0.25rem;
4704
+
4705
+ padding-bottom: 0.25rem
4706
+ }
4707
+ @media (min-width: 1536px) {
4708
+ [data-v-0061c432] .p-toolbar-end .p-button {
4709
+
4710
+ padding-top: 0.5rem;
4711
+
4712
+ padding-bottom: 0.5rem
4713
+ }
4714
+ }
4715
+ [data-v-0061c432] .p-toolbar-start {
4716
+
4717
+ min-width: 0px;
4718
+
4719
+ flex: 1 1 0%;
4720
+
4721
+ overflow: hidden
4722
+ }
4723
+
4724
+ .model_preview[data-v-32e6c4d9] {
4725
+ background-color: var(--comfy-menu-bg);
4726
+ font-family: 'Open Sans', sans-serif;
4727
+ color: var(--descrip-text);
4728
+ border: 1px solid var(--descrip-text);
4729
+ min-width: 300px;
4730
+ max-width: 500px;
4731
+ width: -moz-fit-content;
4732
+ width: fit-content;
4733
+ height: -moz-fit-content;
4734
+ height: fit-content;
4735
+ z-index: 9999;
4736
+ border-radius: 12px;
4737
+ overflow: hidden;
4738
+ font-size: 12px;
4739
+ padding: 10px;
4740
+ }
4741
+ .model_preview_image[data-v-32e6c4d9] {
4742
+ margin: auto;
4743
+ width: -moz-fit-content;
4744
+ width: fit-content;
4745
+ }
4746
+ .model_preview_image img[data-v-32e6c4d9] {
4747
+ max-width: 100%;
4748
+ max-height: 150px;
4749
+ -o-object-fit: contain;
4750
+ object-fit: contain;
4751
+ }
4752
+ .model_preview_title[data-v-32e6c4d9] {
4753
+ font-weight: bold;
4754
+ text-align: center;
4755
+ font-size: 14px;
4756
+ }
4757
+ .model_preview_top_container[data-v-32e6c4d9] {
4758
+ text-align: center;
4759
+ line-height: 0.5;
4760
+ }
4761
+ .model_preview_filename[data-v-32e6c4d9],
4762
+ .model_preview_author[data-v-32e6c4d9],
4763
+ .model_preview_architecture[data-v-32e6c4d9] {
4764
+ display: inline-block;
4765
+ text-align: center;
4766
+ margin: 5px;
4767
+ font-size: 10px;
4768
+ }
4769
+ .model_preview_prefix[data-v-32e6c4d9] {
4770
+ font-weight: bold;
4771
+ }
4772
+
4773
+ .model-lib-model-icon-container[data-v-b45ea43e] {
4774
+ display: inline-block;
4775
+ position: relative;
4776
+ left: 0;
4777
+ height: 1.5rem;
4778
+ vertical-align: top;
4779
+ width: 0px;
4780
+ }
4781
+ .model-lib-model-icon[data-v-b45ea43e] {
4782
+ background-size: cover;
4783
+ background-position: center;
4784
+ display: inline-block;
4785
+ position: relative;
4786
+ left: -2.2rem;
4787
+ top: -0.1rem;
4788
+ height: 1.7rem;
4789
+ width: 1.7rem;
4790
+ vertical-align: top;
4791
+ }
4792
+
4793
+ [data-v-0bb2ac55] .pi-fake-spacer {
4794
+ height: 1px;
4795
+ width: 16px;
4796
+ }
4797
+
4798
+ .slot_row[data-v-d9792337] {
4799
+ padding: 2px;
4800
+ }
4801
+
4802
+ /* Original N-Sidebar styles */
4803
+ ._sb_dot[data-v-d9792337] {
4804
+ width: 8px;
4805
+ height: 8px;
4806
+ border-radius: 50%;
4807
+ background-color: grey;
4808
+ }
4809
+ .node_header[data-v-d9792337] {
4810
+ line-height: 1;
4811
+ padding: 8px 13px 7px;
4812
+ margin-bottom: 5px;
4813
+ font-size: 15px;
4814
+ text-wrap: nowrap;
4815
+ overflow: hidden;
4816
+ display: flex;
4817
+ align-items: center;
4818
+ }
4819
+ .headdot[data-v-d9792337] {
4820
+ width: 10px;
4821
+ height: 10px;
4822
+ float: inline-start;
4823
+ margin-right: 8px;
4824
+ }
4825
+ .IMAGE[data-v-d9792337] {
4826
+ background-color: #64b5f6;
4827
+ }
4828
+ .VAE[data-v-d9792337] {
4829
+ background-color: #ff6e6e;
4830
+ }
4831
+ .LATENT[data-v-d9792337] {
4832
+ background-color: #ff9cf9;
4833
+ }
4834
+ .MASK[data-v-d9792337] {
4835
+ background-color: #81c784;
4836
+ }
4837
+ .CONDITIONING[data-v-d9792337] {
4838
+ background-color: #ffa931;
4839
+ }
4840
+ .CLIP[data-v-d9792337] {
4841
+ background-color: #ffd500;
4842
+ }
4843
+ .MODEL[data-v-d9792337] {
4844
+ background-color: #b39ddb;
4845
+ }
4846
+ .CONTROL_NET[data-v-d9792337] {
4847
+ background-color: #a5d6a7;
4848
+ }
4849
+ ._sb_node_preview[data-v-d9792337] {
4850
+ background-color: var(--comfy-menu-bg);
4851
+ font-family: 'Open Sans', sans-serif;
4852
+ font-size: small;
4853
+ color: var(--descrip-text);
4854
+ border: 1px solid var(--descrip-text);
4855
+ min-width: 300px;
4856
+ width: -moz-min-content;
4857
+ width: min-content;
4858
+ height: -moz-fit-content;
4859
+ height: fit-content;
4860
+ z-index: 9999;
4861
+ border-radius: 12px;
4862
+ overflow: hidden;
4863
+ font-size: 12px;
4864
+ padding-bottom: 10px;
4865
+ }
4866
+ ._sb_node_preview ._sb_description[data-v-d9792337] {
4867
+ margin: 10px;
4868
+ padding: 6px;
4869
+ background: var(--border-color);
4870
+ border-radius: 5px;
4871
+ font-style: italic;
4872
+ font-weight: 500;
4873
+ font-size: 0.9rem;
4874
+ word-break: break-word;
4875
+ }
4876
+ ._sb_table[data-v-d9792337] {
4877
+ display: grid;
4878
+
4879
+ grid-column-gap: 10px;
4880
+ /* Space between the columns */
4881
+ width: 100%;
4882
+ /* Set the table width to 100% of the container */
4883
+ }
4884
+ ._sb_row[data-v-d9792337] {
4885
+ display: grid;
4886
+ grid-template-columns: 10px 1fr 1fr 1fr 10px;
4887
+ grid-column-gap: 10px;
4888
+ align-items: center;
4889
+ padding-left: 9px;
4890
+ padding-right: 9px;
4891
+ }
4892
+ ._sb_row_string[data-v-d9792337] {
4893
+ grid-template-columns: 10px 1fr 1fr 10fr 1fr;
4894
+ }
4895
+ ._sb_col[data-v-d9792337] {
4896
+ border: 0px solid #000;
4897
+ display: flex;
4898
+ align-items: flex-end;
4899
+ flex-direction: row-reverse;
4900
+ flex-wrap: nowrap;
4901
+ align-content: flex-start;
4902
+ justify-content: flex-end;
4903
+ }
4904
+ ._sb_inherit[data-v-d9792337] {
4905
+ display: inherit;
4906
+ }
4907
+ ._long_field[data-v-d9792337] {
4908
+ background: var(--bg-color);
4909
+ border: 2px solid var(--border-color);
4910
+ margin: 5px 5px 0 5px;
4911
+ border-radius: 10px;
4912
+ line-height: 1.7;
4913
+ text-wrap: nowrap;
4914
+ }
4915
+ ._sb_arrow[data-v-d9792337] {
4916
+ color: var(--fg-color);
4917
+ }
4918
+ ._sb_preview_badge[data-v-d9792337] {
4919
+ text-align: center;
4920
+ background: var(--comfy-input-bg);
4921
+ font-weight: bold;
4922
+ color: var(--error-text);
4923
+ }
4924
+
4925
+ ._content[data-v-c4279e6b] {
4926
+
4927
+ display: flex;
4928
+
4929
+ flex-direction: column
4930
+ }
4931
+ ._content[data-v-c4279e6b] > :not([hidden]) ~ :not([hidden]) {
4932
+
4933
+ --tw-space-y-reverse: 0;
4934
+
4935
+ margin-top: calc(0.5rem * calc(1 - var(--tw-space-y-reverse)));
4936
+
4937
+ margin-bottom: calc(0.5rem * var(--tw-space-y-reverse))
4938
+ }
4939
+ ._footer[data-v-c4279e6b] {
4940
+
4941
+ display: flex;
4942
+
4943
+ flex-direction: column;
4944
+
4945
+ align-items: flex-end;
4946
+
4947
+ padding-top: 1rem
4948
+ }
4949
+
4950
+ .node-lib-node-container[data-v-da9a8962] {
4951
+ height: 100%;
4952
+ width: 100%
4953
+ }
4954
+
4955
+ .p-selectbutton .p-button[data-v-bd06e12b] {
4956
+ padding: 0.5rem;
4957
+ }
4958
+ .p-selectbutton .p-button .pi[data-v-bd06e12b] {
4959
+ font-size: 1.5rem;
4960
+ }
4961
+ .field[data-v-bd06e12b] {
4962
+ display: flex;
4963
+ flex-direction: column;
4964
+ gap: 0.5rem;
4965
+ }
4966
+ .color-picker-container[data-v-bd06e12b] {
4967
+ display: flex;
4968
+ align-items: center;
4969
+ gap: 0.5rem;
4970
+ }
4971
+
4972
+ .scroll-container {
4973
+ &[data-v-ad33a347] {
4974
+ height: 100%;
4975
+ overflow-y: auto;
4976
+
4977
+ /* Firefox */
4978
+ scrollbar-width: none;
4979
+ }
4980
+ &[data-v-ad33a347]::-webkit-scrollbar {
4981
+ width: 1px;
4982
+ }
4983
+ &[data-v-ad33a347]::-webkit-scrollbar-thumb {
4984
+ background-color: transparent;
4985
+ }
4986
+ }
4987
+
4988
+ .comfy-image-wrap[data-v-a748ccd8] {
4989
+ display: contents;
4990
+ }
4991
+ .comfy-image-blur[data-v-a748ccd8] {
4992
+ position: absolute;
4993
+ top: 0;
4994
+ left: 0;
4995
+ width: 100%;
4996
+ height: 100%;
4997
+ -o-object-fit: cover;
4998
+ object-fit: cover;
4999
+ }
5000
+ .comfy-image-main[data-v-a748ccd8] {
5001
+ width: 100%;
5002
+ height: 100%;
5003
+ -o-object-fit: cover;
5004
+ object-fit: cover;
5005
+ -o-object-position: center;
5006
+ object-position: center;
5007
+ z-index: 1;
5008
+ }
5009
+ .contain .comfy-image-wrap[data-v-a748ccd8] {
5010
+ position: relative;
5011
+ width: 100%;
5012
+ height: 100%;
5013
+ }
5014
+ .contain .comfy-image-main[data-v-a748ccd8] {
5015
+ -o-object-fit: contain;
5016
+ object-fit: contain;
5017
+ -webkit-backdrop-filter: blur(10px);
5018
+ backdrop-filter: blur(10px);
5019
+ position: absolute;
5020
+ }
5021
+ .broken-image-placeholder[data-v-a748ccd8] {
5022
+ display: flex;
5023
+ flex-direction: column;
5024
+ align-items: center;
5025
+ justify-content: center;
5026
+ width: 100%;
5027
+ height: 100%;
5028
+ margin: 2rem;
5029
+ }
5030
+ .broken-image-placeholder i[data-v-a748ccd8] {
5031
+ font-size: 3rem;
5032
+ margin-bottom: 0.5rem;
5033
+ }
5034
+
5035
+ /* PrimeVue's galleria teleports the fullscreen gallery out of subtree so we
5036
+ cannot use scoped style here. */
5037
+ img.galleria-image {
5038
+ max-width: 100vw;
5039
+ max-height: 100vh;
5040
+ -o-object-fit: contain;
5041
+ object-fit: contain;
5042
+ }
5043
+ .p-galleria-close-button {
5044
+ /* Set z-index so the close button doesn't get hidden behind the image when image is large */
5045
+ z-index: 1;
5046
+ }
5047
+
5048
+ .result-container[data-v-2403edc6] {
5049
+ width: 100%;
5050
+ height: 100%;
5051
+ aspect-ratio: 1 / 1;
5052
+ overflow: hidden;
5053
+ position: relative;
5054
+ display: flex;
5055
+ justify-content: center;
5056
+ align-items: center;
5057
+ }
5058
+ .preview-mask[data-v-2403edc6] {
5059
+ position: absolute;
5060
+ left: 50%;
5061
+ top: 50%;
5062
+ transform: translate(-50%, -50%);
5063
+ display: flex;
5064
+ align-items: center;
5065
+ justify-content: center;
5066
+ opacity: 0;
5067
+ transition: opacity 0.3s ease;
5068
+ z-index: 1;
5069
+ }
5070
+ .result-container:hover .preview-mask[data-v-2403edc6] {
5071
+ opacity: 1;
5072
+ }
5073
+
5074
+ .task-result-preview[data-v-b676a511] {
5075
+ aspect-ratio: 1 / 1;
5076
+ overflow: hidden;
5077
+ display: flex;
5078
+ justify-content: center;
5079
+ align-items: center;
5080
+ width: 100%;
5081
+ height: 100%;
5082
+ }
5083
+ .task-result-preview i[data-v-b676a511],
5084
+ .task-result-preview span[data-v-b676a511] {
5085
+ font-size: 2rem;
5086
+ }
5087
+ .task-item[data-v-b676a511] {
5088
+ display: flex;
5089
+ flex-direction: column;
5090
+ border-radius: 4px;
5091
+ overflow: hidden;
5092
+ position: relative;
5093
+ }
5094
+ .task-item-details[data-v-b676a511] {
5095
+ position: absolute;
5096
+ bottom: 0;
5097
+ padding: 0.6rem;
5098
+ display: flex;
5099
+ justify-content: space-between;
5100
+ align-items: center;
5101
+ width: 100%;
5102
+ z-index: 1;
5103
+ }
5104
+ .task-node-link[data-v-b676a511] {
5105
+ padding: 2px;
5106
+ }
5107
+
5108
+ /* In dark mode, transparent background color for tags is not ideal for tags that
5109
+ are floating on top of images. */
5110
+ .tag-wrapper[data-v-b676a511] {
5111
+ background-color: var(--p-primary-contrast-color);
5112
+ border-radius: 6px;
5113
+ display: inline-flex;
5114
+ }
5115
+ .node-name-tag[data-v-b676a511] {
5116
+ word-break: break-all;
5117
+ }
5118
+ .status-tag-group[data-v-b676a511] {
5119
+ display: flex;
5120
+ flex-direction: column;
5121
+ }
5122
+ .progress-preview-img[data-v-b676a511] {
5123
+ width: 100%;
5124
+ height: 100%;
5125
+ -o-object-fit: cover;
5126
+ object-fit: cover;
5127
+ -o-object-position: center;
5128
+ object-position: center;
5129
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.py ADDED
@@ -0,0 +1,711 @@
1
+ #original code from https://github.com/genmoai/models under apache 2.0 license
2
+ #adapted to ComfyUI
3
+
4
+ from typing import List, Optional, Tuple, Union
5
+ from functools import partial
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import rearrange
12
+
13
+ from comfy.ldm.modules.attention import optimized_attention
14
+
15
+ import comfy.ops
16
+ ops = comfy.ops.disable_weight_init
17
+
18
+ # import mochi_preview.dit.joint_model.context_parallel as cp
19
+ # from mochi_preview.vae.cp_conv import cp_pass_frames, gather_all_frames
20
+
21
+
22
+ def cast_tuple(t, length=1):
23
+ return t if isinstance(t, tuple) else ((t,) * length)
24
+
25
+
26
+ class GroupNormSpatial(ops.GroupNorm):
27
+ """
28
+ GroupNorm applied per-frame.
29
+ """
30
+
31
+ def forward(self, x: torch.Tensor, *, chunk_size: int = 8):
32
+ B, C, T, H, W = x.shape
33
+ x = rearrange(x, "B C T H W -> (B T) C H W")
34
+ # Run group norm in chunks.
35
+ output = torch.empty_like(x)
36
+ for b in range(0, B * T, chunk_size):
37
+ output[b : b + chunk_size] = super().forward(x[b : b + chunk_size])
38
+ return rearrange(output, "(B T) C H W -> B C T H W", B=B, T=T)
39
+
40
+ class PConv3d(ops.Conv3d):
41
+ def __init__(
42
+ self,
43
+ in_channels,
44
+ out_channels,
45
+ kernel_size: Union[int, Tuple[int, int, int]],
46
+ stride: Union[int, Tuple[int, int, int]],
47
+ causal: bool = True,
48
+ context_parallel: bool = True,
49
+ **kwargs,
50
+ ):
51
+ self.causal = causal
52
+ self.context_parallel = context_parallel
53
+ kernel_size = cast_tuple(kernel_size, 3)
54
+ stride = cast_tuple(stride, 3)
55
+ height_pad = (kernel_size[1] - 1) // 2
56
+ width_pad = (kernel_size[2] - 1) // 2
57
+
58
+ super().__init__(
59
+ in_channels=in_channels,
60
+ out_channels=out_channels,
61
+ kernel_size=kernel_size,
62
+ stride=stride,
63
+ dilation=(1, 1, 1),
64
+ padding=(0, height_pad, width_pad),
65
+ **kwargs,
66
+ )
67
+
68
+ def forward(self, x: torch.Tensor):
69
+ # Compute padding amounts.
70
+ context_size = self.kernel_size[0] - 1
71
+ if self.causal:
72
+ pad_front = context_size
73
+ pad_back = 0
74
+ else:
75
+ pad_front = context_size // 2
76
+ pad_back = context_size - pad_front
77
+
78
+ # Apply padding.
79
+ assert self.padding_mode == "replicate" # DEBUG
80
+ mode = "constant" if self.padding_mode == "zeros" else self.padding_mode
81
+ x = F.pad(x, (0, 0, 0, 0, pad_front, pad_back), mode=mode)
82
+ return super().forward(x)
83
+
84
+
85
+ class Conv1x1(ops.Linear):
86
+ """*1x1 Conv implemented with a linear layer."""
87
+
88
+ def __init__(self, in_features: int, out_features: int, *args, **kwargs):
89
+ super().__init__(in_features, out_features, *args, **kwargs)
90
+
91
+ def forward(self, x: torch.Tensor):
92
+ """Forward pass.
93
+
94
+ Args:
95
+ x: Input tensor. Shape: [B, C, *] or [B, *, C].
96
+
97
+ Returns:
98
+ x: Output tensor. Shape: [B, C', *] or [B, *, C'].
99
+ """
100
+ x = x.movedim(1, -1)
101
+ x = super().forward(x)
102
+ x = x.movedim(-1, 1)
103
+ return x
104
+
105
+
106
+ class DepthToSpaceTime(nn.Module):
107
+ def __init__(
108
+ self,
109
+ temporal_expansion: int,
110
+ spatial_expansion: int,
111
+ ):
112
+ super().__init__()
113
+ self.temporal_expansion = temporal_expansion
114
+ self.spatial_expansion = spatial_expansion
115
+
116
+ # When printed, this module should show the temporal and spatial expansion factors.
117
+ def extra_repr(self):
118
+ return f"texp={self.temporal_expansion}, sexp={self.spatial_expansion}"
119
+
120
+ def forward(self, x: torch.Tensor):
121
+ """Forward pass.
122
+
123
+ Args:
124
+ x: Input tensor. Shape: [B, C, T, H, W].
125
+
126
+ Returns:
127
+ x: Rearranged tensor. Shape: [B, C/(st*s*s), T*st, H*s, W*s].
128
+ """
129
+ x = rearrange(
130
+ x,
131
+ "B (C st sh sw) T H W -> B C (T st) (H sh) (W sw)",
132
+ st=self.temporal_expansion,
133
+ sh=self.spatial_expansion,
134
+ sw=self.spatial_expansion,
135
+ )
136
+
137
+ # cp_rank, _ = cp.get_cp_rank_size()
138
+ if self.temporal_expansion > 1: # and cp_rank == 0:
139
+ # Drop the first self.temporal_expansion - 1 frames.
140
+ # This is because we always want the 3x3x3 conv filter to only apply
141
+ # to the first frame, and the first frame doesn't need to be repeated.
142
+ assert all(x.shape)
143
+ x = x[:, :, self.temporal_expansion - 1 :]
144
+ assert all(x.shape)
145
+
146
+ return x
147
+
148
+
149
+ def norm_fn(
150
+ in_channels: int,
151
+ affine: bool = True,
152
+ ):
153
+ return GroupNormSpatial(affine=affine, num_groups=32, num_channels=in_channels)
154
+
155
+
156
+ class ResBlock(nn.Module):
157
+ """Residual block that preserves the spatial dimensions."""
158
+
159
+ def __init__(
160
+ self,
161
+ channels: int,
162
+ *,
163
+ affine: bool = True,
164
+ attn_block: Optional[nn.Module] = None,
165
+ causal: bool = True,
166
+ prune_bottleneck: bool = False,
167
+ padding_mode: str,
168
+ bias: bool = True,
169
+ ):
170
+ super().__init__()
171
+ self.channels = channels
172
+
173
+ assert causal
174
+ self.stack = nn.Sequential(
175
+ norm_fn(channels, affine=affine),
176
+ nn.SiLU(inplace=True),
177
+ PConv3d(
178
+ in_channels=channels,
179
+ out_channels=channels // 2 if prune_bottleneck else channels,
180
+ kernel_size=(3, 3, 3),
181
+ stride=(1, 1, 1),
182
+ padding_mode=padding_mode,
183
+ bias=bias,
184
+ causal=causal,
185
+ ),
186
+ norm_fn(channels, affine=affine),
187
+ nn.SiLU(inplace=True),
188
+ PConv3d(
189
+ in_channels=channels // 2 if prune_bottleneck else channels,
190
+ out_channels=channels,
191
+ kernel_size=(3, 3, 3),
192
+ stride=(1, 1, 1),
193
+ padding_mode=padding_mode,
194
+ bias=bias,
195
+ causal=causal,
196
+ ),
197
+ )
198
+
199
+ self.attn_block = attn_block if attn_block else nn.Identity()
200
+
201
+ def forward(self, x: torch.Tensor):
202
+ """Forward pass.
203
+
204
+ Args:
205
+ x: Input tensor. Shape: [B, C, T, H, W].
206
+ """
207
+ residual = x
208
+ x = self.stack(x)
209
+ x = x + residual
210
+ del residual
211
+
212
+ return self.attn_block(x)
213
+
214
+
215
+ class Attention(nn.Module):
216
+ def __init__(
217
+ self,
218
+ dim: int,
219
+ head_dim: int = 32,
220
+ qkv_bias: bool = False,
221
+ out_bias: bool = True,
222
+ qk_norm: bool = True,
223
+ ) -> None:
224
+ super().__init__()
225
+ self.head_dim = head_dim
226
+ self.num_heads = dim // head_dim
227
+ self.qk_norm = qk_norm
228
+
229
+ self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
230
+ self.out = nn.Linear(dim, dim, bias=out_bias)
231
+
232
+ def forward(
233
+ self,
234
+ x: torch.Tensor,
235
+ ) -> torch.Tensor:
236
+ """Compute temporal self-attention.
237
+
238
+ Args:
239
+ x: Input tensor. Shape: [B, C, T, H, W].
240
+ chunk_size: Chunk size for large tensors.
241
+
242
+ Returns:
243
+ x: Output tensor. Shape: [B, C, T, H, W].
244
+ """
245
+ B, _, T, H, W = x.shape
246
+
247
+ if T == 1:
248
+ # No attention for single frame.
249
+ x = x.movedim(1, -1) # [B, C, T, H, W] -> [B, T, H, W, C]
250
+ qkv = self.qkv(x)
251
+ _, _, x = qkv.chunk(3, dim=-1) # Throw away queries and keys.
252
+ x = self.out(x)
253
+ return x.movedim(-1, 1) # [B, T, H, W, C] -> [B, C, T, H, W]
254
+
255
+ # 1D temporal attention.
256
+ x = rearrange(x, "B C t h w -> (B h w) t C")
257
+ qkv = self.qkv(x)
258
+
259
+ # Input: qkv with shape [B, t, 3 * num_heads * head_dim]
260
+ # Output: x with shape [B, num_heads, t, head_dim]
261
+ q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, self.head_dim).transpose(1, 3).unbind(2)
262
+
263
+ if self.qk_norm:
264
+ q = F.normalize(q, p=2, dim=-1)
265
+ k = F.normalize(k, p=2, dim=-1)
266
+
267
+ x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True)
268
+
269
+ assert x.size(0) == q.size(0)
270
+
271
+ x = self.out(x)
272
+ x = rearrange(x, "(B h w) t C -> B C t h w", B=B, h=H, w=W)
273
+ return x
274
+
275
+
276
+ class AttentionBlock(nn.Module):
277
+ def __init__(
278
+ self,
279
+ dim: int,
280
+ **attn_kwargs,
281
+ ) -> None:
282
+ super().__init__()
283
+ self.norm = norm_fn(dim)
284
+ self.attn = Attention(dim, **attn_kwargs)
285
+
286
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
287
+ return x + self.attn(self.norm(x))
288
+
289
+
290
+ class CausalUpsampleBlock(nn.Module):
291
+ def __init__(
292
+ self,
293
+ in_channels: int,
294
+ out_channels: int,
295
+ num_res_blocks: int,
296
+ *,
297
+ temporal_expansion: int = 2,
298
+ spatial_expansion: int = 2,
299
+ **block_kwargs,
300
+ ):
301
+ super().__init__()
302
+
303
+ blocks = []
304
+ for _ in range(num_res_blocks):
305
+ blocks.append(block_fn(in_channels, **block_kwargs))
306
+ self.blocks = nn.Sequential(*blocks)
307
+
308
+ self.temporal_expansion = temporal_expansion
309
+ self.spatial_expansion = spatial_expansion
310
+
311
+ # Change channels in the final convolution layer.
312
+ self.proj = Conv1x1(
313
+ in_channels,
314
+ out_channels * temporal_expansion * (spatial_expansion**2),
315
+ )
316
+
317
+ self.d2st = DepthToSpaceTime(
318
+ temporal_expansion=temporal_expansion, spatial_expansion=spatial_expansion
319
+ )
320
+
321
+ def forward(self, x):
322
+ x = self.blocks(x)
323
+ x = self.proj(x)
324
+ x = self.d2st(x)
325
+ return x
326
+
327
+
328
+ def block_fn(channels, *, affine: bool = True, has_attention: bool = False, **block_kwargs):
329
+ attn_block = AttentionBlock(channels) if has_attention else None
330
+ return ResBlock(channels, affine=affine, attn_block=attn_block, **block_kwargs)
331
+
332
+
333
+ class DownsampleBlock(nn.Module):
334
+ def __init__(
335
+ self,
336
+ in_channels: int,
337
+ out_channels: int,
338
+ num_res_blocks,
339
+ *,
340
+ temporal_reduction=2,
341
+ spatial_reduction=2,
342
+ **block_kwargs,
343
+ ):
344
+ """
345
+ Downsample block for the VAE encoder.
346
+
347
+ Args:
348
+ in_channels: Number of input channels.
349
+ out_channels: Number of output channels.
350
+ num_res_blocks: Number of residual blocks.
351
+ temporal_reduction: Temporal reduction factor.
352
+ spatial_reduction: Spatial reduction factor.
353
+ """
354
+ super().__init__()
355
+ layers = []
356
+
357
+ # Change the channel count in the strided convolution.
358
+ # This lets the ResBlock have uniform channel count,
359
+ # as in ConvNeXt.
360
+ assert in_channels != out_channels
361
+ layers.append(
362
+ PConv3d(
363
+ in_channels=in_channels,
364
+ out_channels=out_channels,
365
+ kernel_size=(temporal_reduction, spatial_reduction, spatial_reduction),
366
+ stride=(temporal_reduction, spatial_reduction, spatial_reduction),
367
+ # First layer in each block always uses replicate padding
368
+ padding_mode="replicate",
369
+ bias=block_kwargs["bias"],
370
+ )
371
+ )
372
+
373
+ for _ in range(num_res_blocks):
374
+ layers.append(block_fn(out_channels, **block_kwargs))
375
+
376
+ self.layers = nn.Sequential(*layers)
377
+
378
+ def forward(self, x):
379
+ return self.layers(x)
380
+
381
+
382
+ def add_fourier_features(inputs: torch.Tensor, start=6, stop=8, step=1):
383
+ num_freqs = (stop - start) // step
384
+ assert inputs.ndim == 5
385
+ C = inputs.size(1)
386
+
387
+ # Create Base 2 Fourier features.
388
+ freqs = torch.arange(start, stop, step, dtype=inputs.dtype, device=inputs.device)
389
+ assert num_freqs == len(freqs)
390
+ w = torch.pow(2.0, freqs) * (2 * torch.pi) # [num_freqs]
391
+ C = inputs.shape[1]
392
+ w = w.repeat(C)[None, :, None, None, None] # [1, C * num_freqs, 1, 1, 1]
393
+
394
+ # Interleaved repeat of input channels to match w.
395
+ h = inputs.repeat_interleave(num_freqs, dim=1) # [B, C * num_freqs, T, H, W]
396
+ # Scale channels by frequency.
397
+ h = w * h
398
+
399
+ return torch.cat(
400
+ [
401
+ inputs,
402
+ torch.sin(h),
403
+ torch.cos(h),
404
+ ],
405
+ dim=1,
406
+ )
407
+
408
+
409
+ class FourierFeatures(nn.Module):
410
+ def __init__(self, start: int = 6, stop: int = 8, step: int = 1):
411
+ super().__init__()
412
+ self.start = start
413
+ self.stop = stop
414
+ self.step = step
415
+
416
+ def forward(self, inputs):
417
+ """Add Fourier features to inputs.
418
+
419
+ Args:
420
+ inputs: Input tensor. Shape: [B, C, T, H, W]
421
+
422
+ Returns:
423
+ h: Output tensor. Shape: [B, (1 + 2 * num_freqs) * C, T, H, W]
424
+ """
425
+ return add_fourier_features(inputs, self.start, self.stop, self.step)
426
+
427
+
428
+ class Decoder(nn.Module):
429
+ def __init__(
430
+ self,
431
+ *,
432
+ out_channels: int = 3,
433
+ latent_dim: int,
434
+ base_channels: int,
435
+ channel_multipliers: List[int],
436
+ num_res_blocks: List[int],
437
+ temporal_expansions: Optional[List[int]] = None,
438
+ spatial_expansions: Optional[List[int]] = None,
439
+ has_attention: List[bool],
440
+ output_norm: bool = True,
441
+ nonlinearity: str = "silu",
442
+ output_nonlinearity: str = "silu",
443
+ causal: bool = True,
444
+ **block_kwargs,
445
+ ):
446
+ super().__init__()
447
+ self.input_channels = latent_dim
448
+ self.base_channels = base_channels
449
+ self.channel_multipliers = channel_multipliers
450
+ self.num_res_blocks = num_res_blocks
451
+ self.output_nonlinearity = output_nonlinearity
452
+ assert nonlinearity == "silu"
453
+ assert causal
454
+
455
+ ch = [mult * base_channels for mult in channel_multipliers]
456
+ self.num_up_blocks = len(ch) - 1
457
+ assert len(num_res_blocks) == self.num_up_blocks + 2
458
+
459
+ blocks = []
460
+
461
+ first_block = [
462
+ ops.Conv3d(latent_dim, ch[-1], kernel_size=(1, 1, 1))
463
+ ] # Input layer.
464
+ # First set of blocks preserve channel count.
465
+ for _ in range(num_res_blocks[-1]):
466
+ first_block.append(
467
+ block_fn(
468
+ ch[-1],
469
+ has_attention=has_attention[-1],
470
+ causal=causal,
471
+ **block_kwargs,
472
+ )
473
+ )
474
+ blocks.append(nn.Sequential(*first_block))
475
+
476
+ assert len(temporal_expansions) == len(spatial_expansions) == self.num_up_blocks
477
+ assert len(num_res_blocks) == len(has_attention) == self.num_up_blocks + 2
478
+
479
+ upsample_block_fn = CausalUpsampleBlock
480
+
481
+ for i in range(self.num_up_blocks):
482
+ block = upsample_block_fn(
483
+ ch[-i - 1],
484
+ ch[-i - 2],
485
+ num_res_blocks=num_res_blocks[-i - 2],
486
+ has_attention=has_attention[-i - 2],
487
+ temporal_expansion=temporal_expansions[-i - 1],
488
+ spatial_expansion=spatial_expansions[-i - 1],
489
+ causal=causal,
490
+ **block_kwargs,
491
+ )
492
+ blocks.append(block)
493
+
494
+ assert not output_norm
495
+
496
+ # Last block. Preserve channel count.
497
+ last_block = []
498
+ for _ in range(num_res_blocks[0]):
499
+ last_block.append(
500
+ block_fn(
501
+ ch[0], has_attention=has_attention[0], causal=causal, **block_kwargs
502
+ )
503
+ )
504
+ blocks.append(nn.Sequential(*last_block))
505
+
506
+ self.blocks = nn.ModuleList(blocks)
507
+ self.output_proj = Conv1x1(ch[0], out_channels)
508
+
509
+ def forward(self, x):
510
+ """Forward pass.
511
+
512
+ Args:
513
+ x: Latent tensor. Shape: [B, input_channels, t, h, w]. Scaled [-1, 1].
514
+
515
+ Returns:
516
+ x: Reconstructed video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1].
517
+ T + 1 = (t - 1) * 4.
518
+ H = h * 16, W = w * 16.
519
+ """
520
+ for block in self.blocks:
521
+ x = block(x)
522
+
523
+ if self.output_nonlinearity == "silu":
524
+ x = F.silu(x, inplace=not self.training)
525
+ else:
526
+ assert (
527
+ not self.output_nonlinearity
528
+ ) # StyleGAN3 omits the to-RGB nonlinearity.
529
+
530
+ return self.output_proj(x).contiguous()
531
+
532
+ class LatentDistribution:
533
+ def __init__(self, mean: torch.Tensor, logvar: torch.Tensor):
534
+ """Initialize latent distribution.
535
+
536
+ Args:
537
+ mean: Mean of the distribution. Shape: [B, C, T, H, W].
538
+ logvar: Logarithm of variance of the distribution. Shape: [B, C, T, H, W].
539
+ """
540
+ assert mean.shape == logvar.shape
541
+ self.mean = mean
542
+ self.logvar = logvar
543
+
544
+ def sample(self, temperature=1.0, generator: torch.Generator = None, noise=None):
545
+ if temperature == 0.0:
546
+ return self.mean
547
+
548
+ if noise is None:
549
+ noise = torch.randn(self.mean.shape, device=self.mean.device, dtype=self.mean.dtype, generator=generator)
550
+ else:
551
+ assert noise.device == self.mean.device
552
+ noise = noise.to(self.mean.dtype)
553
+
554
+ if temperature != 1.0:
555
+ raise NotImplementedError(f"Temperature {temperature} is not supported.")
556
+
557
+ # Just Gaussian sample with no scaling of variance.
558
+ return noise * torch.exp(self.logvar * 0.5) + self.mean
559
+
560
+ def mode(self):
561
+ return self.mean
562
+
563
+ class Encoder(nn.Module):
564
+ def __init__(
565
+ self,
566
+ *,
567
+ in_channels: int,
568
+ base_channels: int,
569
+ channel_multipliers: List[int],
570
+ num_res_blocks: List[int],
571
+ latent_dim: int,
572
+ temporal_reductions: List[int],
573
+ spatial_reductions: List[int],
574
+ prune_bottlenecks: List[bool],
575
+ has_attentions: List[bool],
576
+ affine: bool = True,
577
+ bias: bool = True,
578
+ input_is_conv_1x1: bool = False,
579
+ padding_mode: str,
580
+ ):
581
+ super().__init__()
582
+ self.temporal_reductions = temporal_reductions
583
+ self.spatial_reductions = spatial_reductions
584
+ self.base_channels = base_channels
585
+ self.channel_multipliers = channel_multipliers
586
+ self.num_res_blocks = num_res_blocks
587
+ self.latent_dim = latent_dim
588
+
589
+ self.fourier_features = FourierFeatures()
590
+ ch = [mult * base_channels for mult in channel_multipliers]
591
+ num_down_blocks = len(ch) - 1
592
+ assert len(num_res_blocks) == num_down_blocks + 2
593
+
594
+ layers = (
595
+ [ops.Conv3d(in_channels, ch[0], kernel_size=(1, 1, 1), bias=True)]
596
+ if not input_is_conv_1x1
597
+ else [Conv1x1(in_channels, ch[0])]
598
+ )
599
+
600
+ assert len(prune_bottlenecks) == num_down_blocks + 2
601
+ assert len(has_attentions) == num_down_blocks + 2
602
+ block = partial(block_fn, padding_mode=padding_mode, affine=affine, bias=bias)
603
+
604
+ for _ in range(num_res_blocks[0]):
605
+ layers.append(block(ch[0], has_attention=has_attentions[0], prune_bottleneck=prune_bottlenecks[0]))
606
+ prune_bottlenecks = prune_bottlenecks[1:]
607
+ has_attentions = has_attentions[1:]
608
+
609
+ assert len(temporal_reductions) == len(spatial_reductions) == len(ch) - 1
610
+ for i in range(num_down_blocks):
611
+ layer = DownsampleBlock(
612
+ ch[i],
613
+ ch[i + 1],
614
+ num_res_blocks=num_res_blocks[i + 1],
615
+ temporal_reduction=temporal_reductions[i],
616
+ spatial_reduction=spatial_reductions[i],
617
+ prune_bottleneck=prune_bottlenecks[i],
618
+ has_attention=has_attentions[i],
619
+ affine=affine,
620
+ bias=bias,
621
+ padding_mode=padding_mode,
622
+ )
623
+
624
+ layers.append(layer)
625
+
626
+ # Additional blocks.
627
+ for _ in range(num_res_blocks[-1]):
628
+ layers.append(block(ch[-1], has_attention=has_attentions[-1], prune_bottleneck=prune_bottlenecks[-1]))
629
+
630
+ self.layers = nn.Sequential(*layers)
631
+
632
+ # Output layers.
633
+ self.output_norm = norm_fn(ch[-1])
634
+ self.output_proj = Conv1x1(ch[-1], 2 * latent_dim, bias=False)
635
+
636
+ @property
637
+ def temporal_downsample(self):
638
+ return math.prod(self.temporal_reductions)
639
+
640
+ @property
641
+ def spatial_downsample(self):
642
+ return math.prod(self.spatial_reductions)
643
+
644
+ def forward(self, x) -> LatentDistribution:
645
+ """Forward pass.
646
+
647
+ Args:
648
+ x: Input video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1]
649
+
650
+ Returns:
651
+ means: Latent tensor. Shape: [B, latent_dim, t, h, w]. Scaled [-1, 1].
652
+ h = H // 8, w = W // 8, t - 1 = (T - 1) // 6
653
+ logvar: Shape: [B, latent_dim, t, h, w].
654
+ """
655
+ assert x.ndim == 5, f"Expected 5D input, got {x.shape}"
656
+ x = self.fourier_features(x)
657
+
658
+ x = self.layers(x)
659
+
660
+ x = self.output_norm(x)
661
+ x = F.silu(x, inplace=True)
662
+ x = self.output_proj(x)
663
+
664
+ means, logvar = torch.chunk(x, 2, dim=1)
665
+
666
+ assert means.ndim == 5
667
+ assert logvar.shape == means.shape
668
+ assert means.size(1) == self.latent_dim
669
+
670
+ return LatentDistribution(means, logvar)
671
+
672
+
673
+ class VideoVAE(nn.Module):
674
+ def __init__(self):
675
+ super().__init__()
676
+ self.encoder = Encoder(
677
+ in_channels=15,
678
+ base_channels=64,
679
+ channel_multipliers=[1, 2, 4, 6],
680
+ num_res_blocks=[3, 3, 4, 6, 3],
681
+ latent_dim=12,
682
+ temporal_reductions=[1, 2, 3],
683
+ spatial_reductions=[2, 2, 2],
684
+ prune_bottlenecks=[False, False, False, False, False],
685
+ has_attentions=[False, True, True, True, True],
686
+ affine=True,
687
+ bias=True,
688
+ input_is_conv_1x1=True,
689
+ padding_mode="replicate"
690
+ )
691
+ self.decoder = Decoder(
692
+ out_channels=3,
693
+ base_channels=128,
694
+ channel_multipliers=[1, 2, 4, 6],
695
+ temporal_expansions=[1, 2, 3],
696
+ spatial_expansions=[2, 2, 2],
697
+ num_res_blocks=[3, 3, 4, 6, 3],
698
+ latent_dim=12,
699
+ has_attention=[False, False, False, False, False],
700
+ padding_mode="replicate",
701
+ output_norm=False,
702
+ nonlinearity="silu",
703
+ output_nonlinearity="silu",
704
+ causal=True,
705
+ )
706
+
707
+ def encode(self, x):
708
+ return self.encoder(x).mode()
709
+
710
+ def decode(self, x):
711
+ return self.decoder(x)
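For orientation, a minimal usage sketch of the VideoVAE defined in model.py above (illustrative only, not part of the uploaded files). It assumes a ComfyUI environment so that comfy.ops and comfy.ldm.modules.attention resolve, and that the file is importable as `model`. Because the layers are built through comfy.ops.disable_weight_init, their weights stay uninitialized until a checkpoint is loaded, so only the wiring and tensor shapes are meaningful here.

# Illustrative only: shapes follow the docstrings in model.py; values are
# meaningless until real weights are loaded into the module.
import torch
from model import VideoVAE  # hypothetical import path for the file above

vae = VideoVAE().eval()

# A short RGB clip scaled to [-1, 1]: [B, C=3, T, H, W].
video = torch.rand(1, 3, 25, 128, 128) * 2.0 - 1.0

with torch.no_grad():
    latents = vae.encode(video)   # mean of the latent distribution, 12 channels
    recon = vae.decode(latents)   # decoded video, 3 channels, scaled to [-1, 1]

print(latents.shape, recon.shape)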
pixel_norm.py ADDED
@@ -0,0 +1,12 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class PixelNorm(nn.Module):
6
+ def __init__(self, dim=1, eps=1e-8):
7
+ super(PixelNorm, self).__init__()
8
+ self.dim = dim
9
+ self.eps = eps
10
+
11
+ def forward(self, x):
12
+ return x / torch.sqrt(torch.mean(x**2, dim=self.dim, keepdim=True) + self.eps)
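A quick sanity check of the normalization above, included only as an illustration (not part of the upload): PixelNorm rescales every (t, h, w) position so that its channel vector has unit RMS.

import torch
from pixel_norm import PixelNorm  # assumes the file is importable under this name

x = torch.randn(2, 128, 4, 8, 8)            # [B, C, T, H, W]
y = PixelNorm(dim=1)(x)                     # normalize across the channel dim
rms = y.pow(2).mean(dim=1).sqrt()           # per-position RMS over channels
print(torch.allclose(rms, torch.ones_like(rms), atol=1e-4))  # expected: True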
put_taesd_encoder_pth_and_taesd_decoder_pth_here ADDED
File without changes
put_vae_here ADDED
File without changes
vae (1)/causal_conv3d.py ADDED
@@ -0,0 +1,64 @@
1
+ from typing import Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import comfy.ops
6
+ ops = comfy.ops.disable_weight_init
7
+
8
+
9
+ class CausalConv3d(nn.Module):
10
+ def __init__(
11
+ self,
12
+ in_channels,
13
+ out_channels,
14
+ kernel_size: int = 3,
15
+ stride: Union[int, Tuple[int]] = 1,
16
+ dilation: int = 1,
17
+ groups: int = 1,
18
+ **kwargs,
19
+ ):
20
+ super().__init__()
21
+
22
+ self.in_channels = in_channels
23
+ self.out_channels = out_channels
24
+
25
+ kernel_size = (kernel_size, kernel_size, kernel_size)
26
+ self.time_kernel_size = kernel_size[0]
27
+
28
+ dilation = (dilation, 1, 1)
29
+
30
+ height_pad = kernel_size[1] // 2
31
+ width_pad = kernel_size[2] // 2
32
+ padding = (0, height_pad, width_pad)
33
+
34
+ self.conv = ops.Conv3d(
35
+ in_channels,
36
+ out_channels,
37
+ kernel_size,
38
+ stride=stride,
39
+ dilation=dilation,
40
+ padding=padding,
41
+ padding_mode="zeros",
42
+ groups=groups,
43
+ )
44
+
45
+ def forward(self, x, causal: bool = True):
46
+ if causal:
47
+ first_frame_pad = x[:, :, :1, :, :].repeat(
48
+ (1, 1, self.time_kernel_size - 1, 1, 1)
49
+ )
50
+ x = torch.concatenate((first_frame_pad, x), dim=2)
51
+ else:
52
+ first_frame_pad = x[:, :, :1, :, :].repeat(
53
+ (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
54
+ )
55
+ last_frame_pad = x[:, :, -1:, :, :].repeat(
56
+ (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
57
+ )
58
+ x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
59
+ x = self.conv(x)
60
+ return x
61
+
62
+ @property
63
+ def weight(self):
64
+ return self.conv.weight
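To make the padding behaviour above concrete, a small sketch (illustrative, not part of the upload): in causal mode the first frame is repeated kernel_size - 1 times in front of the clip, so every output frame depends only on current and past frames, and with stride 1 the temporal length is preserved. It assumes a ComfyUI environment for comfy.ops; the convolution weights are deliberately left uninitialized by disable_weight_init, so only the shapes are meaningful.

import torch
from causal_conv3d import CausalConv3d  # hypothetical import path for the file above

conv = CausalConv3d(in_channels=3, out_channels=8, kernel_size=3, stride=1)
x = torch.randn(1, 3, 9, 32, 32)     # [B, C, T, H, W]

y_causal = conv(x, causal=True)      # pads with two copies of the first frame
y_sym = conv(x, causal=False)        # pads with one first-frame and one last-frame copy
print(y_causal.shape, y_sym.shape)   # both: torch.Size([1, 8, 9, 32, 32])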
vae (1)/causal_video_autoencoder.py ADDED
@@ -0,0 +1,907 @@
1
+ import torch
2
+ from torch import nn
3
+ from functools import partial
4
+ import math
5
+ from einops import rearrange
6
+ from typing import Optional, Tuple, Union
7
+ from .conv_nd_factory import make_conv_nd, make_linear_nd
8
+ from .pixel_norm import PixelNorm
9
+ from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
10
+ import comfy.ops
11
+ ops = comfy.ops.disable_weight_init
12
+
13
+ class Encoder(nn.Module):
14
+ r"""
15
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
16
+
17
+ Args:
18
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
19
+ The number of dimensions to use in convolutions.
20
+ in_channels (`int`, *optional*, defaults to 3):
21
+ The number of input channels.
22
+ out_channels (`int`, *optional*, defaults to 3):
23
+ The number of output channels.
24
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
25
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
26
+ base_channels (`int`, *optional*, defaults to 128):
27
+ The number of output channels for the first convolutional layer.
28
+ norm_num_groups (`int`, *optional*, defaults to 32):
29
+ The number of groups for normalization.
30
+ patch_size (`int`, *optional*, defaults to 1):
31
+ The patch size to use. Should be a power of 2.
32
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
33
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
34
+ latent_log_var (`str`, *optional*, defaults to `per_channel`):
35
+ The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ dims: Union[int, Tuple[int, int]] = 3,
41
+ in_channels: int = 3,
42
+ out_channels: int = 3,
43
+ blocks=[("res_x", 1)],
44
+ base_channels: int = 128,
45
+ norm_num_groups: int = 32,
46
+ patch_size: Union[int, Tuple[int]] = 1,
47
+ norm_layer: str = "group_norm", # group_norm, pixel_norm
48
+ latent_log_var: str = "per_channel",
49
+ ):
50
+ super().__init__()
51
+ self.patch_size = patch_size
52
+ self.norm_layer = norm_layer
53
+ self.latent_channels = out_channels
54
+ self.latent_log_var = latent_log_var
55
+ self.blocks_desc = blocks
56
+
57
+ in_channels = in_channels * patch_size**2
58
+ output_channel = base_channels
59
+
60
+ self.conv_in = make_conv_nd(
61
+ dims=dims,
62
+ in_channels=in_channels,
63
+ out_channels=output_channel,
64
+ kernel_size=3,
65
+ stride=1,
66
+ padding=1,
67
+ causal=True,
68
+ )
69
+
70
+ self.down_blocks = nn.ModuleList([])
71
+
72
+ for block_name, block_params in blocks:
73
+ input_channel = output_channel
74
+ if isinstance(block_params, int):
75
+ block_params = {"num_layers": block_params}
76
+
77
+ if block_name == "res_x":
78
+ block = UNetMidBlock3D(
79
+ dims=dims,
80
+ in_channels=input_channel,
81
+ num_layers=block_params["num_layers"],
82
+ resnet_eps=1e-6,
83
+ resnet_groups=norm_num_groups,
84
+ norm_layer=norm_layer,
85
+ )
86
+ elif block_name == "res_x_y":
87
+ output_channel = block_params.get("multiplier", 2) * output_channel
88
+ block = ResnetBlock3D(
89
+ dims=dims,
90
+ in_channels=input_channel,
91
+ out_channels=output_channel,
92
+ eps=1e-6,
93
+ groups=norm_num_groups,
94
+ norm_layer=norm_layer,
95
+ )
96
+ elif block_name == "compress_time":
97
+ block = make_conv_nd(
98
+ dims=dims,
99
+ in_channels=input_channel,
100
+ out_channels=output_channel,
101
+ kernel_size=3,
102
+ stride=(2, 1, 1),
103
+ causal=True,
104
+ )
105
+ elif block_name == "compress_space":
106
+ block = make_conv_nd(
107
+ dims=dims,
108
+ in_channels=input_channel,
109
+ out_channels=output_channel,
110
+ kernel_size=3,
111
+ stride=(1, 2, 2),
112
+ causal=True,
113
+ )
114
+ elif block_name == "compress_all":
115
+ block = make_conv_nd(
116
+ dims=dims,
117
+ in_channels=input_channel,
118
+ out_channels=output_channel,
119
+ kernel_size=3,
120
+ stride=(2, 2, 2),
121
+ causal=True,
122
+ )
123
+ elif block_name == "compress_all_x_y":
124
+ output_channel = block_params.get("multiplier", 2) * output_channel
125
+ block = make_conv_nd(
126
+ dims=dims,
127
+ in_channels=input_channel,
128
+ out_channels=output_channel,
129
+ kernel_size=3,
130
+ stride=(2, 2, 2),
131
+ causal=True,
132
+ )
133
+ else:
134
+ raise ValueError(f"unknown block: {block_name}")
135
+
136
+ self.down_blocks.append(block)
137
+
138
+ # out
139
+ if norm_layer == "group_norm":
140
+ self.conv_norm_out = nn.GroupNorm(
141
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
142
+ )
143
+ elif norm_layer == "pixel_norm":
144
+ self.conv_norm_out = PixelNorm()
145
+ elif norm_layer == "layer_norm":
146
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
147
+
148
+ self.conv_act = nn.SiLU()
149
+
150
+ conv_out_channels = out_channels
151
+ if latent_log_var == "per_channel":
152
+ conv_out_channels *= 2
153
+ elif latent_log_var == "uniform":
154
+ conv_out_channels += 1
155
+ elif latent_log_var != "none":
156
+ raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
157
+ self.conv_out = make_conv_nd(
158
+ dims, output_channel, conv_out_channels, 3, padding=1, causal=True
159
+ )
160
+
161
+ self.gradient_checkpointing = False
162
+
163
+ def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
164
+ r"""The forward method of the `Encoder` class."""
165
+
166
+ sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
167
+ sample = self.conv_in(sample)
168
+
169
+ checkpoint_fn = (
170
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
171
+ if self.gradient_checkpointing and self.training
172
+ else lambda x: x
173
+ )
174
+
175
+ for down_block in self.down_blocks:
176
+ sample = checkpoint_fn(down_block)(sample)
177
+
178
+ sample = self.conv_norm_out(sample)
179
+ sample = self.conv_act(sample)
180
+ sample = self.conv_out(sample)
181
+
182
+ if self.latent_log_var == "uniform":
183
+ last_channel = sample[:, -1:, ...]
184
+ num_dims = sample.dim()
185
+
186
+ if num_dims == 4:
187
+ # For shape (B, C, H, W)
188
+ repeated_last_channel = last_channel.repeat(
189
+ 1, sample.shape[1] - 2, 1, 1
190
+ )
191
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
192
+ elif num_dims == 5:
193
+ # For shape (B, C, F, H, W)
194
+ repeated_last_channel = last_channel.repeat(
195
+ 1, sample.shape[1] - 2, 1, 1, 1
196
+ )
197
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
198
+ else:
199
+ raise ValueError(f"Invalid input shape: {sample.shape}")
200
+
201
+ return sample
202
+
203
+
204
+ class Decoder(nn.Module):
205
+ r"""
206
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
207
+
208
+ Args:
209
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
210
+ The number of dimensions to use in convolutions.
211
+ in_channels (`int`, *optional*, defaults to 3):
212
+ The number of input channels.
213
+ out_channels (`int`, *optional*, defaults to 3):
214
+ The number of output channels.
215
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
216
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
217
+ base_channels (`int`, *optional*, defaults to 128):
218
+ The number of output channels for the first convolutional layer.
219
+ norm_num_groups (`int`, *optional*, defaults to 32):
220
+ The number of groups for normalization.
221
+ patch_size (`int`, *optional*, defaults to 1):
222
+ The patch size to use. Should be a power of 2.
223
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
224
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
225
+ causal (`bool`, *optional*, defaults to `True`):
226
+ Whether to use causal convolutions or not.
227
+ """
228
+
229
+ def __init__(
230
+ self,
231
+ dims,
232
+ in_channels: int = 3,
233
+ out_channels: int = 3,
234
+ blocks=[("res_x", 1)],
235
+ base_channels: int = 128,
236
+ layers_per_block: int = 2,
237
+ norm_num_groups: int = 32,
238
+ patch_size: int = 1,
239
+ norm_layer: str = "group_norm",
240
+ causal: bool = True,
241
+ timestep_conditioning: bool = False,
242
+ ):
243
+ super().__init__()
244
+ self.patch_size = patch_size
245
+ self.layers_per_block = layers_per_block
246
+ out_channels = out_channels * patch_size**2
247
+ self.causal = causal
248
+ self.blocks_desc = blocks
249
+
250
+ # Compute output channel to be product of all channel-multiplier blocks
251
+ output_channel = base_channels
252
+ for block_name, block_params in list(reversed(blocks)):
253
+ block_params = block_params if isinstance(block_params, dict) else {}
254
+ if block_name == "res_x_y":
255
+ output_channel = output_channel * block_params.get("multiplier", 2)
256
+ if block_name == "compress_all":
257
+ output_channel = output_channel * block_params.get("multiplier", 1)
258
+
259
+ self.conv_in = make_conv_nd(
260
+ dims,
261
+ in_channels,
262
+ output_channel,
263
+ kernel_size=3,
264
+ stride=1,
265
+ padding=1,
266
+ causal=True,
267
+ )
268
+
269
+ self.up_blocks = nn.ModuleList([])
270
+
271
+ for block_name, block_params in list(reversed(blocks)):
272
+ input_channel = output_channel
273
+ if isinstance(block_params, int):
274
+ block_params = {"num_layers": block_params}
275
+
276
+ if block_name == "res_x":
277
+ block = UNetMidBlock3D(
278
+ dims=dims,
279
+ in_channels=input_channel,
280
+ num_layers=block_params["num_layers"],
281
+ resnet_eps=1e-6,
282
+ resnet_groups=norm_num_groups,
283
+ norm_layer=norm_layer,
284
+ inject_noise=block_params.get("inject_noise", False),
285
+ timestep_conditioning=timestep_conditioning,
286
+ )
287
+ elif block_name == "attn_res_x":
288
+ block = UNetMidBlock3D(
289
+ dims=dims,
290
+ in_channels=input_channel,
291
+ num_layers=block_params["num_layers"],
292
+ resnet_groups=norm_num_groups,
293
+ norm_layer=norm_layer,
294
+ inject_noise=block_params.get("inject_noise", False),
295
+ timestep_conditioning=timestep_conditioning,
296
+ attention_head_dim=block_params["attention_head_dim"],
297
+ )
298
+ elif block_name == "res_x_y":
299
+ output_channel = output_channel // block_params.get("multiplier", 2)
300
+ block = ResnetBlock3D(
301
+ dims=dims,
302
+ in_channels=input_channel,
303
+ out_channels=output_channel,
304
+ eps=1e-6,
305
+ groups=norm_num_groups,
306
+ norm_layer=norm_layer,
307
+ inject_noise=block_params.get("inject_noise", False),
308
+ timestep_conditioning=False,
309
+ )
310
+ elif block_name == "compress_time":
311
+ block = DepthToSpaceUpsample(
312
+ dims=dims, in_channels=input_channel, stride=(2, 1, 1)
313
+ )
314
+ elif block_name == "compress_space":
315
+ block = DepthToSpaceUpsample(
316
+ dims=dims, in_channels=input_channel, stride=(1, 2, 2)
317
+ )
318
+ elif block_name == "compress_all":
319
+ output_channel = output_channel // block_params.get("multiplier", 1)
320
+ block = DepthToSpaceUpsample(
321
+ dims=dims,
322
+ in_channels=input_channel,
323
+ stride=(2, 2, 2),
324
+ residual=block_params.get("residual", False),
325
+ out_channels_reduction_factor=block_params.get("multiplier", 1),
326
+ )
327
+ else:
328
+ raise ValueError(f"unknown layer: {block_name}")
329
+
330
+ self.up_blocks.append(block)
331
+
332
+ if norm_layer == "group_norm":
333
+ self.conv_norm_out = nn.GroupNorm(
334
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
335
+ )
336
+ elif norm_layer == "pixel_norm":
337
+ self.conv_norm_out = PixelNorm()
338
+ elif norm_layer == "layer_norm":
339
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
340
+
341
+ self.conv_act = nn.SiLU()
342
+ self.conv_out = make_conv_nd(
343
+ dims, output_channel, out_channels, 3, padding=1, causal=True
344
+ )
345
+
346
+ self.gradient_checkpointing = False
347
+
348
+ self.timestep_conditioning = timestep_conditioning
349
+
350
+ if timestep_conditioning:
351
+ self.timestep_scale_multiplier = nn.Parameter(
352
+ torch.tensor(1000.0, dtype=torch.float32)
353
+ )
354
+ self.last_time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
355
+ output_channel * 2, 0, operations=ops,
356
+ )
357
+ self.last_scale_shift_table = nn.Parameter(torch.empty(2, output_channel))
358
+
359
+ # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
360
+ def forward(
361
+ self,
362
+ sample: torch.FloatTensor,
363
+ timestep: Optional[torch.Tensor] = None,
364
+ ) -> torch.FloatTensor:
365
+ r"""The forward method of the `Decoder` class."""
366
+ batch_size = sample.shape[0]
367
+
368
+ sample = self.conv_in(sample, causal=self.causal)
369
+
370
+ checkpoint_fn = (
371
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
372
+ if self.gradient_checkpointing and self.training
373
+ else lambda x: x
374
+ )
375
+
376
+ scaled_timestep = None
377
+ if self.timestep_conditioning:
378
+ assert (
379
+ timestep is not None
380
+ ), "should pass timestep with timestep_conditioning=True"
381
+ scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device)
382
+
383
+ for up_block in self.up_blocks:
384
+ if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
385
+ sample = checkpoint_fn(up_block)(
386
+ sample, causal=self.causal, timestep=scaled_timestep
387
+ )
388
+ else:
389
+ sample = checkpoint_fn(up_block)(sample, causal=self.causal)
390
+
391
+ sample = self.conv_norm_out(sample)
392
+
393
+ if self.timestep_conditioning:
394
+ embedded_timestep = self.last_time_embedder(
395
+ timestep=scaled_timestep.flatten(),
396
+ resolution=None,
397
+ aspect_ratio=None,
398
+ batch_size=sample.shape[0],
399
+ hidden_dtype=sample.dtype,
400
+ )
401
+ embedded_timestep = embedded_timestep.view(
402
+ batch_size, embedded_timestep.shape[-1], 1, 1, 1
403
+ )
404
+ ada_values = self.last_scale_shift_table[
405
+ None, ..., None, None, None
406
+ ].to(device=sample.device, dtype=sample.dtype) + embedded_timestep.reshape(
407
+ batch_size,
408
+ 2,
409
+ -1,
410
+ embedded_timestep.shape[-3],
411
+ embedded_timestep.shape[-2],
412
+ embedded_timestep.shape[-1],
413
+ )
414
+ shift, scale = ada_values.unbind(dim=1)
415
+ sample = sample * (1 + scale) + shift
416
+
417
+ sample = self.conv_act(sample)
418
+ sample = self.conv_out(sample, causal=self.causal)
419
+
420
+ sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
421
+
422
+ return sample
423
+
424
+
425
+ class UNetMidBlock3D(nn.Module):
426
+ """
427
+ A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
428
+
429
+ Args:
430
+ in_channels (`int`): The number of input channels.
431
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
432
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
433
+ resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
434
+ resnet_groups (`int`, *optional*, defaults to 32):
435
+ The number of groups to use in the group normalization layers of the resnet blocks.
436
+
437
+ Returns:
438
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
439
+ in_channels, height, width)`.
440
+
441
+ """
442
+
443
+ def __init__(
444
+ self,
445
+ dims: Union[int, Tuple[int, int]],
446
+ in_channels: int,
447
+ dropout: float = 0.0,
448
+ num_layers: int = 1,
449
+ resnet_eps: float = 1e-6,
450
+ resnet_groups: int = 32,
451
+ norm_layer: str = "group_norm",
452
+ inject_noise: bool = False,
453
+ timestep_conditioning: bool = False,
454
+ ):
455
+ super().__init__()
456
+ resnet_groups = (
457
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
458
+ )
459
+
460
+ self.timestep_conditioning = timestep_conditioning
461
+
462
+ if timestep_conditioning:
463
+ self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
464
+ in_channels * 4, 0, operations=ops,
465
+ )
466
+
467
+ self.res_blocks = nn.ModuleList(
468
+ [
469
+ ResnetBlock3D(
470
+ dims=dims,
471
+ in_channels=in_channels,
472
+ out_channels=in_channels,
473
+ eps=resnet_eps,
474
+ groups=resnet_groups,
475
+ dropout=dropout,
476
+ norm_layer=norm_layer,
477
+ inject_noise=inject_noise,
478
+ timestep_conditioning=timestep_conditioning,
479
+ )
480
+ for _ in range(num_layers)
481
+ ]
482
+ )
483
+
484
+ def forward(
485
+ self, hidden_states: torch.FloatTensor, causal: bool = True, timestep: Optional[torch.Tensor] = None
486
+ ) -> torch.FloatTensor:
487
+ timestep_embed = None
488
+ if self.timestep_conditioning:
489
+ assert (
490
+ timestep is not None
491
+ ), "should pass timestep with timestep_conditioning=True"
492
+ batch_size = hidden_states.shape[0]
493
+ timestep_embed = self.time_embedder(
494
+ timestep=timestep.flatten(),
495
+ resolution=None,
496
+ aspect_ratio=None,
497
+ batch_size=batch_size,
498
+ hidden_dtype=hidden_states.dtype,
499
+ )
500
+ timestep_embed = timestep_embed.view(
501
+ batch_size, timestep_embed.shape[-1], 1, 1, 1
502
+ )
503
+
504
+ for resnet in self.res_blocks:
505
+ hidden_states = resnet(hidden_states, causal=causal, timestep=timestep_embed)
506
+
507
+ return hidden_states
508
+
509
+
510
+ class DepthToSpaceUpsample(nn.Module):
511
+ def __init__(
512
+ self, dims, in_channels, stride, residual=False, out_channels_reduction_factor=1
513
+ ):
514
+ super().__init__()
515
+ self.stride = stride
516
+ self.out_channels = (
517
+ math.prod(stride) * in_channels // out_channels_reduction_factor
518
+ )
519
+ self.conv = make_conv_nd(
520
+ dims=dims,
521
+ in_channels=in_channels,
522
+ out_channels=self.out_channels,
523
+ kernel_size=3,
524
+ stride=1,
525
+ causal=True,
526
+ )
527
+ self.residual = residual
528
+ self.out_channels_reduction_factor = out_channels_reduction_factor
529
+
530
+ def forward(self, x, causal: bool = True, timestep: Optional[torch.Tensor] = None):
531
+ if self.residual:
532
+ # Reshape and duplicate the input to match the output shape
533
+ x_in = rearrange(
534
+ x,
535
+ "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
536
+ p1=self.stride[0],
537
+ p2=self.stride[1],
538
+ p3=self.stride[2],
539
+ )
540
+ num_repeat = math.prod(self.stride) // self.out_channels_reduction_factor
541
+ x_in = x_in.repeat(1, num_repeat, 1, 1, 1)
542
+ if self.stride[0] == 2:
543
+ x_in = x_in[:, :, 1:, :, :]
544
+ x = self.conv(x, causal=causal)
545
+ x = rearrange(
546
+ x,
547
+ "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
548
+ p1=self.stride[0],
549
+ p2=self.stride[1],
550
+ p3=self.stride[2],
551
+ )
552
+ if self.stride[0] == 2:
553
+ x = x[:, :, 1:, :, :]
554
+ if self.residual:
555
+ x = x + x_in
556
+ return x
557
+
558
+ class LayerNorm(nn.Module):
559
+ def __init__(self, dim, eps, elementwise_affine=True) -> None:
560
+ super().__init__()
561
+ self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
562
+
563
+ def forward(self, x):
564
+ x = rearrange(x, "b c d h w -> b d h w c")
565
+ x = self.norm(x)
566
+ x = rearrange(x, "b d h w c -> b c d h w")
567
+ return x
568
+
569
+
570
+ class ResnetBlock3D(nn.Module):
571
+ r"""
572
+ A Resnet block.
573
+
574
+ Parameters:
575
+ in_channels (`int`): The number of channels in the input.
576
+ out_channels (`int`, *optional*, default to be `None`):
577
+ The number of output channels for the first conv layer. If None, same as `in_channels`.
578
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
579
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
580
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
581
+ """
582
+
583
+ def __init__(
584
+ self,
585
+ dims: Union[int, Tuple[int, int]],
586
+ in_channels: int,
587
+ out_channels: Optional[int] = None,
588
+ dropout: float = 0.0,
589
+ groups: int = 32,
590
+ eps: float = 1e-6,
591
+ norm_layer: str = "group_norm",
592
+ inject_noise: bool = False,
593
+ timestep_conditioning: bool = False,
594
+ ):
595
+ super().__init__()
596
+ self.in_channels = in_channels
597
+ out_channels = in_channels if out_channels is None else out_channels
598
+ self.out_channels = out_channels
599
+ self.inject_noise = inject_noise
600
+
601
+ if norm_layer == "group_norm":
602
+ self.norm1 = nn.GroupNorm(
603
+ num_groups=groups, num_channels=in_channels, eps=eps, affine=True
604
+ )
605
+ elif norm_layer == "pixel_norm":
606
+ self.norm1 = PixelNorm()
607
+ elif norm_layer == "layer_norm":
608
+ self.norm1 = LayerNorm(in_channels, eps=eps, elementwise_affine=True)
609
+
610
+ self.non_linearity = nn.SiLU()
611
+
612
+ self.conv1 = make_conv_nd(
613
+ dims,
614
+ in_channels,
615
+ out_channels,
616
+ kernel_size=3,
617
+ stride=1,
618
+ padding=1,
619
+ causal=True,
620
+ )
621
+
622
+ if inject_noise:
623
+ self.per_channel_scale1 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
624
+
625
+ if norm_layer == "group_norm":
626
+ self.norm2 = nn.GroupNorm(
627
+ num_groups=groups, num_channels=out_channels, eps=eps, affine=True
628
+ )
629
+ elif norm_layer == "pixel_norm":
630
+ self.norm2 = PixelNorm()
631
+ elif norm_layer == "layer_norm":
632
+ self.norm2 = LayerNorm(out_channels, eps=eps, elementwise_affine=True)
633
+
634
+ self.dropout = torch.nn.Dropout(dropout)
635
+
636
+ self.conv2 = make_conv_nd(
637
+ dims,
638
+ out_channels,
639
+ out_channels,
640
+ kernel_size=3,
641
+ stride=1,
642
+ padding=1,
643
+ causal=True,
644
+ )
645
+
646
+ if inject_noise:
647
+ self.per_channel_scale2 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
648
+
649
+ self.conv_shortcut = (
650
+ make_linear_nd(
651
+ dims=dims, in_channels=in_channels, out_channels=out_channels
652
+ )
653
+ if in_channels != out_channels
654
+ else nn.Identity()
655
+ )
656
+
657
+ self.norm3 = (
658
+ LayerNorm(in_channels, eps=eps, elementwise_affine=True)
659
+ if in_channels != out_channels
660
+ else nn.Identity()
661
+ )
662
+
663
+ self.timestep_conditioning = timestep_conditioning
664
+
665
+ if timestep_conditioning:
666
+ self.scale_shift_table = nn.Parameter(
667
+ torch.randn(4, in_channels) / in_channels**0.5
668
+ )
669
+
670
+ def _feed_spatial_noise(
671
+ self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
672
+ ) -> torch.FloatTensor:
673
+ spatial_shape = hidden_states.shape[-2:]
674
+ device = hidden_states.device
675
+ dtype = hidden_states.dtype
676
+
677
+ # similar to the "explicit noise inputs" method in style-gan
678
+ spatial_noise = torch.randn(spatial_shape, device=device, dtype=dtype)[None]
679
+ scaled_noise = (spatial_noise * per_channel_scale)[None, :, None, ...]
680
+ hidden_states = hidden_states + scaled_noise
681
+
682
+ return hidden_states
683
+
684
+ def forward(
685
+ self,
686
+ input_tensor: torch.FloatTensor,
687
+ causal: bool = True,
688
+ timestep: Optional[torch.Tensor] = None,
689
+ ) -> torch.FloatTensor:
690
+ hidden_states = input_tensor
691
+ batch_size = hidden_states.shape[0]
692
+
693
+ hidden_states = self.norm1(hidden_states)
694
+ if self.timestep_conditioning:
695
+ assert (
696
+ timestep is not None
697
+ ), "should pass timestep with timestep_conditioning=True"
698
+ ada_values = self.scale_shift_table[
699
+ None, ..., None, None, None
700
+ ].to(device=hidden_states.device, dtype=hidden_states.dtype) + timestep.reshape(
701
+ batch_size,
702
+ 4,
703
+ -1,
704
+ timestep.shape[-3],
705
+ timestep.shape[-2],
706
+ timestep.shape[-1],
707
+ )
708
+ shift1, scale1, shift2, scale2 = ada_values.unbind(dim=1)
709
+
710
+ hidden_states = hidden_states * (1 + scale1) + shift1
711
+
712
+ hidden_states = self.non_linearity(hidden_states)
713
+
714
+ hidden_states = self.conv1(hidden_states, causal=causal)
715
+
716
+ if self.inject_noise:
717
+ hidden_states = self._feed_spatial_noise(
718
+ hidden_states, self.per_channel_scale1.to(device=hidden_states.device, dtype=hidden_states.dtype)
719
+ )
720
+
721
+ hidden_states = self.norm2(hidden_states)
722
+
723
+ if self.timestep_conditioning:
724
+ hidden_states = hidden_states * (1 + scale2) + shift2
725
+
726
+ hidden_states = self.non_linearity(hidden_states)
727
+
728
+ hidden_states = self.dropout(hidden_states)
729
+
730
+ hidden_states = self.conv2(hidden_states, causal=causal)
731
+
732
+ if self.inject_noise:
733
+ hidden_states = self._feed_spatial_noise(
734
+ hidden_states, self.per_channel_scale2.to(device=hidden_states.device, dtype=hidden_states.dtype)
735
+ )
736
+
737
+ input_tensor = self.norm3(input_tensor)
738
+
739
+ batch_size = input_tensor.shape[0]
740
+
741
+ input_tensor = self.conv_shortcut(input_tensor)
742
+
743
+ output_tensor = input_tensor + hidden_states
744
+
745
+ return output_tensor
746
+
747
+
748
+ def patchify(x, patch_size_hw, patch_size_t=1):
749
+ if patch_size_hw == 1 and patch_size_t == 1:
750
+ return x
751
+ if x.dim() == 4:
752
+ x = rearrange(
753
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
754
+ )
755
+ elif x.dim() == 5:
756
+ x = rearrange(
757
+ x,
758
+ "b c (f p) (h q) (w r) -> b (c p r q) f h w",
759
+ p=patch_size_t,
760
+ q=patch_size_hw,
761
+ r=patch_size_hw,
762
+ )
763
+ else:
764
+ raise ValueError(f"Invalid input shape: {x.shape}")
765
+
766
+ return x
767
+
768
+
769
+ def unpatchify(x, patch_size_hw, patch_size_t=1):
770
+ if patch_size_hw == 1 and patch_size_t == 1:
771
+ return x
772
+
773
+ if x.dim() == 4:
774
+ x = rearrange(
775
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
776
+ )
777
+ elif x.dim() == 5:
778
+ x = rearrange(
779
+ x,
780
+ "b (c p r q) f h w -> b c (f p) (h q) (w r)",
781
+ p=patch_size_t,
782
+ q=patch_size_hw,
783
+ r=patch_size_hw,
784
+ )
785
+
786
+ return x
787
+
788
+ class processor(nn.Module):
789
+ def __init__(self):
790
+ super().__init__()
791
+ self.register_buffer("std-of-means", torch.empty(128))
792
+ self.register_buffer("mean-of-means", torch.empty(128))
793
+ self.register_buffer("mean-of-stds", torch.empty(128))
794
+ self.register_buffer("mean-of-stds_over_std-of-means", torch.empty(128))
795
+ self.register_buffer("channel", torch.empty(128))
796
+
797
+ def un_normalize(self, x):
798
+ return (x * self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)) + self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)
799
+
800
+ def normalize(self, x):
801
+ return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)
802
+
803
+ class VideoVAE(nn.Module):
804
+ def __init__(self, version=0):
805
+ super().__init__()
806
+
807
+ if version == 0:
808
+ config = {
809
+ "_class_name": "CausalVideoAutoencoder",
810
+ "dims": 3,
811
+ "in_channels": 3,
812
+ "out_channels": 3,
813
+ "latent_channels": 128,
814
+ "blocks": [
815
+ ["res_x", 4],
816
+ ["compress_all", 1],
817
+ ["res_x_y", 1],
818
+ ["res_x", 3],
819
+ ["compress_all", 1],
820
+ ["res_x_y", 1],
821
+ ["res_x", 3],
822
+ ["compress_all", 1],
823
+ ["res_x", 3],
824
+ ["res_x", 4],
825
+ ],
826
+ "scaling_factor": 1.0,
827
+ "norm_layer": "pixel_norm",
828
+ "patch_size": 4,
829
+ "latent_log_var": "uniform",
830
+ "use_quant_conv": False,
831
+ "causal_decoder": False,
832
+ }
833
+ else:
834
+ config = {
835
+ "_class_name": "CausalVideoAutoencoder",
836
+ "dims": 3,
837
+ "in_channels": 3,
838
+ "out_channels": 3,
839
+ "latent_channels": 128,
840
+ "decoder_blocks": [
841
+ ["res_x", {"num_layers": 5, "inject_noise": True}],
842
+ ["compress_all", {"residual": True, "multiplier": 2}],
843
+ ["res_x", {"num_layers": 6, "inject_noise": True}],
844
+ ["compress_all", {"residual": True, "multiplier": 2}],
845
+ ["res_x", {"num_layers": 7, "inject_noise": True}],
846
+ ["compress_all", {"residual": True, "multiplier": 2}],
847
+ ["res_x", {"num_layers": 8, "inject_noise": False}]
848
+ ],
849
+ "encoder_blocks": [
850
+ ["res_x", {"num_layers": 4}],
851
+ ["compress_all", {}],
852
+ ["res_x_y", 1],
853
+ ["res_x", {"num_layers": 3}],
854
+ ["compress_all", {}],
855
+ ["res_x_y", 1],
856
+ ["res_x", {"num_layers": 3}],
857
+ ["compress_all", {}],
858
+ ["res_x", {"num_layers": 3}],
859
+ ["res_x", {"num_layers": 4}]
860
+ ],
861
+ "scaling_factor": 1.0,
862
+ "norm_layer": "pixel_norm",
863
+ "patch_size": 4,
864
+ "latent_log_var": "uniform",
865
+ "use_quant_conv": False,
866
+ "causal_decoder": False,
867
+ "timestep_conditioning": True,
868
+ }
869
+
870
+ double_z = config.get("double_z", True)
871
+ latent_log_var = config.get(
872
+ "latent_log_var", "per_channel" if double_z else "none"
873
+ )
874
+
875
+ self.encoder = Encoder(
876
+ dims=config["dims"],
877
+ in_channels=config.get("in_channels", 3),
878
+ out_channels=config["latent_channels"],
879
+ blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
880
+ patch_size=config.get("patch_size", 1),
881
+ latent_log_var=latent_log_var,
882
+ norm_layer=config.get("norm_layer", "group_norm"),
883
+ )
884
+
885
+ self.decoder = Decoder(
886
+ dims=config["dims"],
887
+ in_channels=config["latent_channels"],
888
+ out_channels=config.get("out_channels", 3),
889
+ blocks=config.get("decoder_blocks", config.get("blocks")),
890
+ patch_size=config.get("patch_size", 1),
891
+ norm_layer=config.get("norm_layer", "group_norm"),
892
+ causal=config.get("causal_decoder", False),
893
+ timestep_conditioning=config.get("timestep_conditioning", False),
894
+ )
895
+
896
+ self.timestep_conditioning = config.get("timestep_conditioning", False)
897
+ self.per_channel_statistics = processor()
898
+
899
+ def encode(self, x):
900
+ means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
901
+ return self.per_channel_statistics.normalize(means)
902
+
903
+ def decode(self, x, timestep=0.05, noise_scale=0.025):
904
+ if self.timestep_conditioning: #TODO: seed
905
+ x = torch.randn_like(x) * noise_scale + (1.0 - noise_scale) * x
906
+ return self.decoder(self.per_channel_statistics.un_normalize(x), timestep=timestep)
907
+
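A quick shape check may help make the unpatchify layout above concrete. This is a minimal sketch, not part of the uploaded files; it reproduces the 5D branch with plain einops on arbitrary tensor sizes.

import torch
from einops import rearrange

# packed latent: channels = c * p * r * q with c=3, p=1 (temporal patch), q=r=4 (spatial patch)
x = torch.randn(1, 3 * 1 * 4 * 4, 2, 8, 8)
y = rearrange(x, "b (c p r q) f h w -> b c (f p) (h q) (w r)", p=1, q=4, r=4)
assert y.shape == (1, 3, 2, 32, 32)  # 4x4 spatial patches unfolded back to pixels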
vae (1)/conv_nd_factory.py ADDED
@@ -0,0 +1,82 @@
1
+ from typing import Tuple, Union
2
+
3
+
4
+ from .dual_conv3d import DualConv3d
5
+ from .causal_conv3d import CausalConv3d
6
+ import comfy.ops
7
+ ops = comfy.ops.disable_weight_init
8
+
9
+ def make_conv_nd(
10
+ dims: Union[int, Tuple[int, int]],
11
+ in_channels: int,
12
+ out_channels: int,
13
+ kernel_size: int,
14
+ stride=1,
15
+ padding=0,
16
+ dilation=1,
17
+ groups=1,
18
+ bias=True,
19
+ causal=False,
20
+ ):
21
+ if dims == 2:
22
+ return ops.Conv2d(
23
+ in_channels=in_channels,
24
+ out_channels=out_channels,
25
+ kernel_size=kernel_size,
26
+ stride=stride,
27
+ padding=padding,
28
+ dilation=dilation,
29
+ groups=groups,
30
+ bias=bias,
31
+ )
32
+ elif dims == 3:
33
+ if causal:
34
+ return CausalConv3d(
35
+ in_channels=in_channels,
36
+ out_channels=out_channels,
37
+ kernel_size=kernel_size,
38
+ stride=stride,
39
+ padding=padding,
40
+ dilation=dilation,
41
+ groups=groups,
42
+ bias=bias,
43
+ )
44
+ return ops.Conv3d(
45
+ in_channels=in_channels,
46
+ out_channels=out_channels,
47
+ kernel_size=kernel_size,
48
+ stride=stride,
49
+ padding=padding,
50
+ dilation=dilation,
51
+ groups=groups,
52
+ bias=bias,
53
+ )
54
+ elif dims == (2, 1):
55
+ return DualConv3d(
56
+ in_channels=in_channels,
57
+ out_channels=out_channels,
58
+ kernel_size=kernel_size,
59
+ stride=stride,
60
+ padding=padding,
61
+ bias=bias,
62
+ )
63
+ else:
64
+ raise ValueError(f"unsupported dimensions: {dims}")
65
+
66
+
67
+ def make_linear_nd(
68
+ dims: int,
69
+ in_channels: int,
70
+ out_channels: int,
71
+ bias=True,
72
+ ):
73
+ if dims == 2:
74
+ return ops.Conv2d(
75
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
76
+ )
77
+ elif dims == 3 or dims == (2, 1):
78
+ return ops.Conv3d(
79
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
80
+ )
81
+ else:
82
+ raise ValueError(f"unsupported dimensions: {dims}")
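make_conv_nd dispatches on dims: 2 builds a plain Conv2d, 3 builds a CausalConv3d when causal=True and a Conv3d otherwise, and the tuple (2, 1) selects the factorized DualConv3d. A short usage sketch, assuming it runs inside a ComfyUI checkout where comfy.ops and the sibling modules resolve; the import path below is illustrative.

# illustrative import path; adjust to wherever this package lives in the checkout
from comfy.ldm.lightricks.vae.conv_nd_factory import make_conv_nd

spatial_conv = make_conv_nd(dims=2, in_channels=64, out_channels=64, kernel_size=3, padding=1)
causal_conv = make_conv_nd(dims=3, in_channels=64, out_channels=64, kernel_size=3, causal=True)
dual_conv = make_conv_nd(dims=(2, 1), in_channels=64, out_channels=128, kernel_size=3, padding=1)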
vae (1)/dual_conv3d.py ADDED
@@ -0,0 +1,195 @@
1
+ import math
2
+ from typing import Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+
9
+
10
+ class DualConv3d(nn.Module):
11
+ def __init__(
12
+ self,
13
+ in_channels,
14
+ out_channels,
15
+ kernel_size,
16
+ stride: Union[int, Tuple[int, int, int]] = 1,
17
+ padding: Union[int, Tuple[int, int, int]] = 0,
18
+ dilation: Union[int, Tuple[int, int, int]] = 1,
19
+ groups=1,
20
+ bias=True,
21
+ ):
22
+ super(DualConv3d, self).__init__()
23
+
24
+ self.in_channels = in_channels
25
+ self.out_channels = out_channels
26
+ # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
27
+ if isinstance(kernel_size, int):
28
+ kernel_size = (kernel_size, kernel_size, kernel_size)
29
+ if kernel_size == (1, 1, 1):
30
+ raise ValueError(
31
+ "kernel_size must be greater than 1. Use make_linear_nd instead."
32
+ )
33
+ if isinstance(stride, int):
34
+ stride = (stride, stride, stride)
35
+ if isinstance(padding, int):
36
+ padding = (padding, padding, padding)
37
+ if isinstance(dilation, int):
38
+ dilation = (dilation, dilation, dilation)
39
+
40
+ # Set parameters for convolutions
41
+ self.groups = groups
42
+ self.bias = bias
43
+
44
+ # Define the size of the channels after the first convolution
45
+ intermediate_channels = (
46
+ out_channels if in_channels < out_channels else in_channels
47
+ )
48
+
49
+ # Define parameters for the first convolution
50
+ self.weight1 = nn.Parameter(
51
+ torch.Tensor(
52
+ intermediate_channels,
53
+ in_channels // groups,
54
+ 1,
55
+ kernel_size[1],
56
+ kernel_size[2],
57
+ )
58
+ )
59
+ self.stride1 = (1, stride[1], stride[2])
60
+ self.padding1 = (0, padding[1], padding[2])
61
+ self.dilation1 = (1, dilation[1], dilation[2])
62
+ if bias:
63
+ self.bias1 = nn.Parameter(torch.Tensor(intermediate_channels))
64
+ else:
65
+ self.register_parameter("bias1", None)
66
+
67
+ # Define parameters for the second convolution
68
+ self.weight2 = nn.Parameter(
69
+ torch.Tensor(
70
+ out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
71
+ )
72
+ )
73
+ self.stride2 = (stride[0], 1, 1)
74
+ self.padding2 = (padding[0], 0, 0)
75
+ self.dilation2 = (dilation[0], 1, 1)
76
+ if bias:
77
+ self.bias2 = nn.Parameter(torch.Tensor(out_channels))
78
+ else:
79
+ self.register_parameter("bias2", None)
80
+
81
+ # Initialize weights and biases
82
+ self.reset_parameters()
83
+
84
+ def reset_parameters(self):
85
+ nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
86
+ nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
87
+ if self.bias:
88
+ fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)
89
+ bound1 = 1 / math.sqrt(fan_in1)
90
+ nn.init.uniform_(self.bias1, -bound1, bound1)
91
+ fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)
92
+ bound2 = 1 / math.sqrt(fan_in2)
93
+ nn.init.uniform_(self.bias2, -bound2, bound2)
94
+
95
+ def forward(self, x, use_conv3d=False, skip_time_conv=False):
96
+ if use_conv3d:
97
+ return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)
98
+ else:
99
+ return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)
100
+
101
+ def forward_with_3d(self, x, skip_time_conv):
102
+ # First convolution
103
+ x = F.conv3d(
104
+ x,
105
+ self.weight1,
106
+ self.bias1,
107
+ self.stride1,
108
+ self.padding1,
109
+ self.dilation1,
110
+ self.groups,
111
+ )
112
+
113
+ if skip_time_conv:
114
+ return x
115
+
116
+ # Second convolution
117
+ x = F.conv3d(
118
+ x,
119
+ self.weight2,
120
+ self.bias2,
121
+ self.stride2,
122
+ self.padding2,
123
+ self.dilation2,
124
+ self.groups,
125
+ )
126
+
127
+ return x
128
+
129
+ def forward_with_2d(self, x, skip_time_conv):
130
+ b, c, d, h, w = x.shape
131
+
132
+ # First 2D convolution
133
+ x = rearrange(x, "b c d h w -> (b d) c h w")
134
+ # Squeeze the depth dimension out of weight1 since it's 1
135
+ weight1 = self.weight1.squeeze(2)
136
+ # Select stride, padding, and dilation for the 2D convolution
137
+ stride1 = (self.stride1[1], self.stride1[2])
138
+ padding1 = (self.padding1[1], self.padding1[2])
139
+ dilation1 = (self.dilation1[1], self.dilation1[2])
140
+ x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
141
+
142
+ _, _, h, w = x.shape
143
+
144
+ if skip_time_conv:
145
+ x = rearrange(x, "(b d) c h w -> b c d h w", b=b)
146
+ return x
147
+
148
+ # Second convolution which is essentially treated as a 1D convolution across the 'd' dimension
149
+ x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)
150
+
151
+ # Reshape weight2 to match the expected dimensions for conv1d
152
+ weight2 = self.weight2.squeeze(-1).squeeze(-1)
153
+ # Use only the relevant dimension for stride, padding, and dilation for the 1D convolution
154
+ stride2 = self.stride2[0]
155
+ padding2 = self.padding2[0]
156
+ dilation2 = self.dilation2[0]
157
+ x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
158
+ x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
159
+
160
+ return x
161
+
162
+ @property
163
+ def weight(self):
164
+ return self.weight2
165
+
166
+
167
+ def test_dual_conv3d_consistency():
168
+ # Initialize parameters
169
+ in_channels = 3
170
+ out_channels = 5
171
+ kernel_size = (3, 3, 3)
172
+ stride = (2, 2, 2)
173
+ padding = (1, 1, 1)
174
+
175
+ # Create an instance of the DualConv3d class
176
+ dual_conv3d = DualConv3d(
177
+ in_channels=in_channels,
178
+ out_channels=out_channels,
179
+ kernel_size=kernel_size,
180
+ stride=stride,
181
+ padding=padding,
182
+ bias=True,
183
+ )
184
+
185
+ # Example input tensor
186
+ test_input = torch.randn(1, 3, 10, 10, 10)
187
+
188
+ # Perform forward passes with both 3D and 2D settings
189
+ output_conv3d = dual_conv3d(test_input, use_conv3d=True)
190
+ output_2d = dual_conv3d(test_input, use_conv3d=False)
191
+
192
+ # Assert that the outputs from both methods are sufficiently close
193
+ assert torch.allclose(
194
+ output_conv3d, output_2d, atol=1e-6
195
+ ), "Outputs are not consistent between 3D and 2D convolutions."
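The built-in consistency test above already covers the numerics; the sketch below just shows the two forward modes side by side, assuming the DualConv3d class defined above is in scope (it only needs torch and einops).

import torch

conv = DualConv3d(in_channels=3, out_channels=8, kernel_size=3, stride=1, padding=1, bias=True)
video = torch.randn(2, 3, 9, 32, 32)        # [B, C, D, H, W]
out_3d = conv(video, use_conv3d=True)       # two genuine 3D convolutions
out_2d = conv(video, use_conv3d=False)      # 2D spatial conv followed by 1D temporal conv
print(out_3d.shape, torch.allclose(out_3d, out_2d, atol=1e-6))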
vae (1)/pixel_norm.py ADDED
@@ -0,0 +1,12 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class PixelNorm(nn.Module):
6
+ def __init__(self, dim=1, eps=1e-8):
7
+ super(PixelNorm, self).__init__()
8
+ self.dim = dim
9
+ self.eps = eps
10
+
11
+ def forward(self, x):
12
+ return x / torch.sqrt(torch.mean(x**2, dim=self.dim, keepdim=True) + self.eps)
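PixelNorm rescales each (batch, frame, pixel) position so its channel vector has unit RMS; unlike GroupNorm it carries no learnable parameters and no running statistics. A tiny sketch of the effect, assuming the class above is in scope:

import torch

norm = PixelNorm(dim=1)                      # normalize across the channel dimension
x = torch.randn(1, 128, 4, 16, 16)           # [B, C, T, H, W]
y = norm(x)
rms = torch.sqrt(torch.mean(y ** 2, dim=1))  # per-position RMS over channels
print(rms.min().item(), rms.max().item())    # both ~1.0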
vae (2)/model.py ADDED
@@ -0,0 +1,711 @@
1
+ #original code from https://github.com/genmoai/models under apache 2.0 license
2
+ #adapted to ComfyUI
3
+
4
+ from typing import List, Optional, Tuple, Union
5
+ from functools import partial
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from einops import rearrange
12
+
13
+ from comfy.ldm.modules.attention import optimized_attention
14
+
15
+ import comfy.ops
16
+ ops = comfy.ops.disable_weight_init
17
+
18
+ # import mochi_preview.dit.joint_model.context_parallel as cp
19
+ # from mochi_preview.vae.cp_conv import cp_pass_frames, gather_all_frames
20
+
21
+
22
+ def cast_tuple(t, length=1):
23
+ return t if isinstance(t, tuple) else ((t,) * length)
24
+
25
+
26
+ class GroupNormSpatial(ops.GroupNorm):
27
+ """
28
+ GroupNorm applied per-frame.
29
+ """
30
+
31
+ def forward(self, x: torch.Tensor, *, chunk_size: int = 8):
32
+ B, C, T, H, W = x.shape
33
+ x = rearrange(x, "B C T H W -> (B T) C H W")
34
+ # Run group norm in chunks.
35
+ output = torch.empty_like(x)
36
+ for b in range(0, B * T, chunk_size):
37
+ output[b : b + chunk_size] = super().forward(x[b : b + chunk_size])
38
+ return rearrange(output, "(B T) C H W -> B C T H W", B=B, T=T)
39
+
40
+ class PConv3d(ops.Conv3d):
41
+ def __init__(
42
+ self,
43
+ in_channels,
44
+ out_channels,
45
+ kernel_size: Union[int, Tuple[int, int, int]],
46
+ stride: Union[int, Tuple[int, int, int]],
47
+ causal: bool = True,
48
+ context_parallel: bool = True,
49
+ **kwargs,
50
+ ):
51
+ self.causal = causal
52
+ self.context_parallel = context_parallel
53
+ kernel_size = cast_tuple(kernel_size, 3)
54
+ stride = cast_tuple(stride, 3)
55
+ height_pad = (kernel_size[1] - 1) // 2
56
+ width_pad = (kernel_size[2] - 1) // 2
57
+
58
+ super().__init__(
59
+ in_channels=in_channels,
60
+ out_channels=out_channels,
61
+ kernel_size=kernel_size,
62
+ stride=stride,
63
+ dilation=(1, 1, 1),
64
+ padding=(0, height_pad, width_pad),
65
+ **kwargs,
66
+ )
67
+
68
+ def forward(self, x: torch.Tensor):
69
+ # Compute padding amounts.
70
+ context_size = self.kernel_size[0] - 1
71
+ if self.causal:
72
+ pad_front = context_size
73
+ pad_back = 0
74
+ else:
75
+ pad_front = context_size // 2
76
+ pad_back = context_size - pad_front
77
+
78
+ # Apply padding.
79
+ assert self.padding_mode == "replicate" # DEBUG
80
+ mode = "constant" if self.padding_mode == "zeros" else self.padding_mode
81
+ x = F.pad(x, (0, 0, 0, 0, pad_front, pad_back), mode=mode)
82
+ return super().forward(x)
83
+
84
+
85
+ class Conv1x1(ops.Linear):
86
+ """1x1 Conv implemented with a linear layer."""
87
+
88
+ def __init__(self, in_features: int, out_features: int, *args, **kwargs):
89
+ super().__init__(in_features, out_features, *args, **kwargs)
90
+
91
+ def forward(self, x: torch.Tensor):
92
+ """Forward pass.
93
+
94
+ Args:
95
+ x: Input tensor. Shape: [B, C, *] or [B, *, C].
96
+
97
+ Returns:
98
+ x: Output tensor. Shape: [B, C', *] or [B, *, C'].
99
+ """
100
+ x = x.movedim(1, -1)
101
+ x = super().forward(x)
102
+ x = x.movedim(-1, 1)
103
+ return x
104
+
105
+
106
+ class DepthToSpaceTime(nn.Module):
107
+ def __init__(
108
+ self,
109
+ temporal_expansion: int,
110
+ spatial_expansion: int,
111
+ ):
112
+ super().__init__()
113
+ self.temporal_expansion = temporal_expansion
114
+ self.spatial_expansion = spatial_expansion
115
+
116
+ # When printed, this module should show the temporal and spatial expansion factors.
117
+ def extra_repr(self):
118
+ return f"texp={self.temporal_expansion}, sexp={self.spatial_expansion}"
119
+
120
+ def forward(self, x: torch.Tensor):
121
+ """Forward pass.
122
+
123
+ Args:
124
+ x: Input tensor. Shape: [B, C, T, H, W].
125
+
126
+ Returns:
127
+ x: Rearranged tensor. Shape: [B, C/(st*s*s), T*st, H*s, W*s].
128
+ """
129
+ x = rearrange(
130
+ x,
131
+ "B (C st sh sw) T H W -> B C (T st) (H sh) (W sw)",
132
+ st=self.temporal_expansion,
133
+ sh=self.spatial_expansion,
134
+ sw=self.spatial_expansion,
135
+ )
136
+
137
+ # cp_rank, _ = cp.get_cp_rank_size()
138
+ if self.temporal_expansion > 1: # and cp_rank == 0:
139
+ # Drop the first self.temporal_expansion - 1 frames.
140
+ # This is because we always want the 3x3x3 conv filter to only apply
141
+ # to the first frame, and the first frame doesn't need to be repeated.
142
+ assert all(x.shape)
143
+ x = x[:, :, self.temporal_expansion - 1 :]
144
+ assert all(x.shape)
145
+
146
+ return x
147
+
148
+
149
+ def norm_fn(
150
+ in_channels: int,
151
+ affine: bool = True,
152
+ ):
153
+ return GroupNormSpatial(affine=affine, num_groups=32, num_channels=in_channels)
154
+
155
+
156
+ class ResBlock(nn.Module):
157
+ """Residual block that preserves the spatial dimensions."""
158
+
159
+ def __init__(
160
+ self,
161
+ channels: int,
162
+ *,
163
+ affine: bool = True,
164
+ attn_block: Optional[nn.Module] = None,
165
+ causal: bool = True,
166
+ prune_bottleneck: bool = False,
167
+ padding_mode: str,
168
+ bias: bool = True,
169
+ ):
170
+ super().__init__()
171
+ self.channels = channels
172
+
173
+ assert causal
174
+ self.stack = nn.Sequential(
175
+ norm_fn(channels, affine=affine),
176
+ nn.SiLU(inplace=True),
177
+ PConv3d(
178
+ in_channels=channels,
179
+ out_channels=channels // 2 if prune_bottleneck else channels,
180
+ kernel_size=(3, 3, 3),
181
+ stride=(1, 1, 1),
182
+ padding_mode=padding_mode,
183
+ bias=bias,
184
+ causal=causal,
185
+ ),
186
+ norm_fn(channels, affine=affine),
187
+ nn.SiLU(inplace=True),
188
+ PConv3d(
189
+ in_channels=channels // 2 if prune_bottleneck else channels,
190
+ out_channels=channels,
191
+ kernel_size=(3, 3, 3),
192
+ stride=(1, 1, 1),
193
+ padding_mode=padding_mode,
194
+ bias=bias,
195
+ causal=causal,
196
+ ),
197
+ )
198
+
199
+ self.attn_block = attn_block if attn_block else nn.Identity()
200
+
201
+ def forward(self, x: torch.Tensor):
202
+ """Forward pass.
203
+
204
+ Args:
205
+ x: Input tensor. Shape: [B, C, T, H, W].
206
+ """
207
+ residual = x
208
+ x = self.stack(x)
209
+ x = x + residual
210
+ del residual
211
+
212
+ return self.attn_block(x)
213
+
214
+
215
+ class Attention(nn.Module):
216
+ def __init__(
217
+ self,
218
+ dim: int,
219
+ head_dim: int = 32,
220
+ qkv_bias: bool = False,
221
+ out_bias: bool = True,
222
+ qk_norm: bool = True,
223
+ ) -> None:
224
+ super().__init__()
225
+ self.head_dim = head_dim
226
+ self.num_heads = dim // head_dim
227
+ self.qk_norm = qk_norm
228
+
229
+ self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
230
+ self.out = nn.Linear(dim, dim, bias=out_bias)
231
+
232
+ def forward(
233
+ self,
234
+ x: torch.Tensor,
235
+ ) -> torch.Tensor:
236
+ """Compute temporal self-attention.
237
+
238
+ Args:
239
+ x: Input tensor. Shape: [B, C, T, H, W].
240
+ chunk_size: Chunk size for large tensors.
241
+
242
+ Returns:
243
+ x: Output tensor. Shape: [B, C, T, H, W].
244
+ """
245
+ B, _, T, H, W = x.shape
246
+
247
+ if T == 1:
248
+ # No attention for single frame.
249
+ x = x.movedim(1, -1) # [B, C, T, H, W] -> [B, T, H, W, C]
250
+ qkv = self.qkv(x)
251
+ _, _, x = qkv.chunk(3, dim=-1) # Throw away queries and keys.
252
+ x = self.out(x)
253
+ return x.movedim(-1, 1) # [B, T, H, W, C] -> [B, C, T, H, W]
254
+
255
+ # 1D temporal attention.
256
+ x = rearrange(x, "B C t h w -> (B h w) t C")
257
+ qkv = self.qkv(x)
258
+
259
+ # Input: qkv with shape [B, t, 3 * num_heads * head_dim]
260
+ # Output: x with shape [B, num_heads, t, head_dim]
261
+ q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, self.head_dim).transpose(1, 3).unbind(2)
262
+
263
+ if self.qk_norm:
264
+ q = F.normalize(q, p=2, dim=-1)
265
+ k = F.normalize(k, p=2, dim=-1)
266
+
267
+ x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True)
268
+
269
+ assert x.size(0) == q.size(0)
270
+
271
+ x = self.out(x)
272
+ x = rearrange(x, "(B h w) t C -> B C t h w", B=B, h=H, w=W)
273
+ return x
274
+
275
+
276
+ class AttentionBlock(nn.Module):
277
+ def __init__(
278
+ self,
279
+ dim: int,
280
+ **attn_kwargs,
281
+ ) -> None:
282
+ super().__init__()
283
+ self.norm = norm_fn(dim)
284
+ self.attn = Attention(dim, **attn_kwargs)
285
+
286
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
287
+ return x + self.attn(self.norm(x))
288
+
289
+
290
+ class CausalUpsampleBlock(nn.Module):
291
+ def __init__(
292
+ self,
293
+ in_channels: int,
294
+ out_channels: int,
295
+ num_res_blocks: int,
296
+ *,
297
+ temporal_expansion: int = 2,
298
+ spatial_expansion: int = 2,
299
+ **block_kwargs,
300
+ ):
301
+ super().__init__()
302
+
303
+ blocks = []
304
+ for _ in range(num_res_blocks):
305
+ blocks.append(block_fn(in_channels, **block_kwargs))
306
+ self.blocks = nn.Sequential(*blocks)
307
+
308
+ self.temporal_expansion = temporal_expansion
309
+ self.spatial_expansion = spatial_expansion
310
+
311
+ # Change channels in the final convolution layer.
312
+ self.proj = Conv1x1(
313
+ in_channels,
314
+ out_channels * temporal_expansion * (spatial_expansion**2),
315
+ )
316
+
317
+ self.d2st = DepthToSpaceTime(
318
+ temporal_expansion=temporal_expansion, spatial_expansion=spatial_expansion
319
+ )
320
+
321
+ def forward(self, x):
322
+ x = self.blocks(x)
323
+ x = self.proj(x)
324
+ x = self.d2st(x)
325
+ return x
326
+
327
+
328
+ def block_fn(channels, *, affine: bool = True, has_attention: bool = False, **block_kwargs):
329
+ attn_block = AttentionBlock(channels) if has_attention else None
330
+ return ResBlock(channels, affine=affine, attn_block=attn_block, **block_kwargs)
331
+
332
+
333
+ class DownsampleBlock(nn.Module):
334
+ def __init__(
335
+ self,
336
+ in_channels: int,
337
+ out_channels: int,
338
+ num_res_blocks,
339
+ *,
340
+ temporal_reduction=2,
341
+ spatial_reduction=2,
342
+ **block_kwargs,
343
+ ):
344
+ """
345
+ Downsample block for the VAE encoder.
346
+
347
+ Args:
348
+ in_channels: Number of input channels.
349
+ out_channels: Number of output channels.
350
+ num_res_blocks: Number of residual blocks.
351
+ temporal_reduction: Temporal reduction factor.
352
+ spatial_reduction: Spatial reduction factor.
353
+ """
354
+ super().__init__()
355
+ layers = []
356
+
357
+ # Change the channel count in the strided convolution.
358
+ # This lets the ResBlock have uniform channel count,
359
+ # as in ConvNeXt.
360
+ assert in_channels != out_channels
361
+ layers.append(
362
+ PConv3d(
363
+ in_channels=in_channels,
364
+ out_channels=out_channels,
365
+ kernel_size=(temporal_reduction, spatial_reduction, spatial_reduction),
366
+ stride=(temporal_reduction, spatial_reduction, spatial_reduction),
367
+ # First layer in each block always uses replicate padding
368
+ padding_mode="replicate",
369
+ bias=block_kwargs["bias"],
370
+ )
371
+ )
372
+
373
+ for _ in range(num_res_blocks):
374
+ layers.append(block_fn(out_channels, **block_kwargs))
375
+
376
+ self.layers = nn.Sequential(*layers)
377
+
378
+ def forward(self, x):
379
+ return self.layers(x)
380
+
381
+
382
+ def add_fourier_features(inputs: torch.Tensor, start=6, stop=8, step=1):
383
+ num_freqs = (stop - start) // step
384
+ assert inputs.ndim == 5
385
+ C = inputs.size(1)
386
+
387
+ # Create Base 2 Fourier features.
388
+ freqs = torch.arange(start, stop, step, dtype=inputs.dtype, device=inputs.device)
389
+ assert num_freqs == len(freqs)
390
+ w = torch.pow(2.0, freqs) * (2 * torch.pi) # [num_freqs]
391
+ C = inputs.shape[1]
392
+ w = w.repeat(C)[None, :, None, None, None] # [1, C * num_freqs, 1, 1, 1]
393
+
394
+ # Interleaved repeat of input channels to match w.
395
+ h = inputs.repeat_interleave(num_freqs, dim=1) # [B, C * num_freqs, T, H, W]
396
+ # Scale channels by frequency.
397
+ h = w * h
398
+
399
+ return torch.cat(
400
+ [
401
+ inputs,
402
+ torch.sin(h),
403
+ torch.cos(h),
404
+ ],
405
+ dim=1,
406
+ )
407
+
408
+
409
+ class FourierFeatures(nn.Module):
410
+ def __init__(self, start: int = 6, stop: int = 8, step: int = 1):
411
+ super().__init__()
412
+ self.start = start
413
+ self.stop = stop
414
+ self.step = step
415
+
416
+ def forward(self, inputs):
417
+ """Add Fourier features to inputs.
418
+
419
+ Args:
420
+ inputs: Input tensor. Shape: [B, C, T, H, W]
421
+
422
+ Returns:
423
+ h: Output tensor. Shape: [B, (1 + 2 * num_freqs) * C, T, H, W]
424
+ """
425
+ return add_fourier_features(inputs, self.start, self.stop, self.step)
426
+
427
+
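With the default start=6, stop=8, step=1 there are two frequencies, so FourierFeatures returns (1 + 2 * 2) * C channels; for an RGB input that is 15, which matches the in_channels the VideoVAE encoder further down is constructed with. A quick check, assuming the definitions above are in scope:

import torch

ff = FourierFeatures()                 # defaults: start=6, stop=8, step=1 -> two frequencies
video = torch.randn(1, 3, 8, 16, 16)   # [B, C, T, H, W]
print(ff(video).shape)                 # torch.Size([1, 15, 8, 16, 16])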
428
+ class Decoder(nn.Module):
429
+ def __init__(
430
+ self,
431
+ *,
432
+ out_channels: int = 3,
433
+ latent_dim: int,
434
+ base_channels: int,
435
+ channel_multipliers: List[int],
436
+ num_res_blocks: List[int],
437
+ temporal_expansions: Optional[List[int]] = None,
438
+ spatial_expansions: Optional[List[int]] = None,
439
+ has_attention: List[bool],
440
+ output_norm: bool = True,
441
+ nonlinearity: str = "silu",
442
+ output_nonlinearity: str = "silu",
443
+ causal: bool = True,
444
+ **block_kwargs,
445
+ ):
446
+ super().__init__()
447
+ self.input_channels = latent_dim
448
+ self.base_channels = base_channels
449
+ self.channel_multipliers = channel_multipliers
450
+ self.num_res_blocks = num_res_blocks
451
+ self.output_nonlinearity = output_nonlinearity
452
+ assert nonlinearity == "silu"
453
+ assert causal
454
+
455
+ ch = [mult * base_channels for mult in channel_multipliers]
456
+ self.num_up_blocks = len(ch) - 1
457
+ assert len(num_res_blocks) == self.num_up_blocks + 2
458
+
459
+ blocks = []
460
+
461
+ first_block = [
462
+ ops.Conv3d(latent_dim, ch[-1], kernel_size=(1, 1, 1))
463
+ ] # Input layer.
464
+ # First set of blocks preserve channel count.
465
+ for _ in range(num_res_blocks[-1]):
466
+ first_block.append(
467
+ block_fn(
468
+ ch[-1],
469
+ has_attention=has_attention[-1],
470
+ causal=causal,
471
+ **block_kwargs,
472
+ )
473
+ )
474
+ blocks.append(nn.Sequential(*first_block))
475
+
476
+ assert len(temporal_expansions) == len(spatial_expansions) == self.num_up_blocks
477
+ assert len(num_res_blocks) == len(has_attention) == self.num_up_blocks + 2
478
+
479
+ upsample_block_fn = CausalUpsampleBlock
480
+
481
+ for i in range(self.num_up_blocks):
482
+ block = upsample_block_fn(
483
+ ch[-i - 1],
484
+ ch[-i - 2],
485
+ num_res_blocks=num_res_blocks[-i - 2],
486
+ has_attention=has_attention[-i - 2],
487
+ temporal_expansion=temporal_expansions[-i - 1],
488
+ spatial_expansion=spatial_expansions[-i - 1],
489
+ causal=causal,
490
+ **block_kwargs,
491
+ )
492
+ blocks.append(block)
493
+
494
+ assert not output_norm
495
+
496
+ # Last block. Preserve channel count.
497
+ last_block = []
498
+ for _ in range(num_res_blocks[0]):
499
+ last_block.append(
500
+ block_fn(
501
+ ch[0], has_attention=has_attention[0], causal=causal, **block_kwargs
502
+ )
503
+ )
504
+ blocks.append(nn.Sequential(*last_block))
505
+
506
+ self.blocks = nn.ModuleList(blocks)
507
+ self.output_proj = Conv1x1(ch[0], out_channels)
508
+
509
+ def forward(self, x):
510
+ """Forward pass.
511
+
512
+ Args:
513
+ x: Latent tensor. Shape: [B, input_channels, t, h, w]. Scaled [-1, 1].
514
+
515
+ Returns:
516
+ x: Reconstructed video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1].
517
+ T - 1 = (t - 1) * 6 for the expansions used in this file.
518
+ H = h * 8, W = w * 8 for the expansions used in this file.
519
+ """
520
+ for block in self.blocks:
521
+ x = block(x)
522
+
523
+ if self.output_nonlinearity == "silu":
524
+ x = F.silu(x, inplace=not self.training)
525
+ else:
526
+ assert (
527
+ not self.output_nonlinearity
528
+ ) # StyleGAN3 omits the to-RGB nonlinearity.
529
+
530
+ return self.output_proj(x).contiguous()
531
+
532
+ class LatentDistribution:
533
+ def __init__(self, mean: torch.Tensor, logvar: torch.Tensor):
534
+ """Initialize latent distribution.
535
+
536
+ Args:
537
+ mean: Mean of the distribution. Shape: [B, C, T, H, W].
538
+ logvar: Logarithm of variance of the distribution. Shape: [B, C, T, H, W].
539
+ """
540
+ assert mean.shape == logvar.shape
541
+ self.mean = mean
542
+ self.logvar = logvar
543
+
544
+ def sample(self, temperature=1.0, generator: torch.Generator = None, noise=None):
545
+ if temperature == 0.0:
546
+ return self.mean
547
+
548
+ if noise is None:
549
+ noise = torch.randn(self.mean.shape, device=self.mean.device, dtype=self.mean.dtype, generator=generator)
550
+ else:
551
+ assert noise.device == self.mean.device
552
+ noise = noise.to(self.mean.dtype)
553
+
554
+ if temperature != 1.0:
555
+ raise NotImplementedError(f"Temperature {temperature} is not supported.")
556
+
557
+ # Just Gaussian sample with no scaling of variance.
558
+ return noise * torch.exp(self.logvar * 0.5) + self.mean
559
+
560
+ def mode(self):
561
+ return self.mean
562
+
563
+ class Encoder(nn.Module):
564
+ def __init__(
565
+ self,
566
+ *,
567
+ in_channels: int,
568
+ base_channels: int,
569
+ channel_multipliers: List[int],
570
+ num_res_blocks: List[int],
571
+ latent_dim: int,
572
+ temporal_reductions: List[int],
573
+ spatial_reductions: List[int],
574
+ prune_bottlenecks: List[bool],
575
+ has_attentions: List[bool],
576
+ affine: bool = True,
577
+ bias: bool = True,
578
+ input_is_conv_1x1: bool = False,
579
+ padding_mode: str,
580
+ ):
581
+ super().__init__()
582
+ self.temporal_reductions = temporal_reductions
583
+ self.spatial_reductions = spatial_reductions
584
+ self.base_channels = base_channels
585
+ self.channel_multipliers = channel_multipliers
586
+ self.num_res_blocks = num_res_blocks
587
+ self.latent_dim = latent_dim
588
+
589
+ self.fourier_features = FourierFeatures()
590
+ ch = [mult * base_channels for mult in channel_multipliers]
591
+ num_down_blocks = len(ch) - 1
592
+ assert len(num_res_blocks) == num_down_blocks + 2
593
+
594
+ layers = (
595
+ [ops.Conv3d(in_channels, ch[0], kernel_size=(1, 1, 1), bias=True)]
596
+ if not input_is_conv_1x1
597
+ else [Conv1x1(in_channels, ch[0])]
598
+ )
599
+
600
+ assert len(prune_bottlenecks) == num_down_blocks + 2
601
+ assert len(has_attentions) == num_down_blocks + 2
602
+ block = partial(block_fn, padding_mode=padding_mode, affine=affine, bias=bias)
603
+
604
+ for _ in range(num_res_blocks[0]):
605
+ layers.append(block(ch[0], has_attention=has_attentions[0], prune_bottleneck=prune_bottlenecks[0]))
606
+ prune_bottlenecks = prune_bottlenecks[1:]
607
+ has_attentions = has_attentions[1:]
608
+
609
+ assert len(temporal_reductions) == len(spatial_reductions) == len(ch) - 1
610
+ for i in range(num_down_blocks):
611
+ layer = DownsampleBlock(
612
+ ch[i],
613
+ ch[i + 1],
614
+ num_res_blocks=num_res_blocks[i + 1],
615
+ temporal_reduction=temporal_reductions[i],
616
+ spatial_reduction=spatial_reductions[i],
617
+ prune_bottleneck=prune_bottlenecks[i],
618
+ has_attention=has_attentions[i],
619
+ affine=affine,
620
+ bias=bias,
621
+ padding_mode=padding_mode,
622
+ )
623
+
624
+ layers.append(layer)
625
+
626
+ # Additional blocks.
627
+ for _ in range(num_res_blocks[-1]):
628
+ layers.append(block(ch[-1], has_attention=has_attentions[-1], prune_bottleneck=prune_bottlenecks[-1]))
629
+
630
+ self.layers = nn.Sequential(*layers)
631
+
632
+ # Output layers.
633
+ self.output_norm = norm_fn(ch[-1])
634
+ self.output_proj = Conv1x1(ch[-1], 2 * latent_dim, bias=False)
635
+
636
+ @property
637
+ def temporal_downsample(self):
638
+ return math.prod(self.temporal_reductions)
639
+
640
+ @property
641
+ def spatial_downsample(self):
642
+ return math.prod(self.spatial_reductions)
643
+
644
+ def forward(self, x) -> LatentDistribution:
645
+ """Forward pass.
646
+
647
+ Args:
648
+ x: Input video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1]
649
+
650
+ Returns:
651
+ means: Latent tensor. Shape: [B, latent_dim, t, h, w]. Scaled [-1, 1].
652
+ h = H // 8, w = W // 8, t - 1 = (T - 1) // 6
653
+ logvar: Shape: [B, latent_dim, t, h, w].
654
+ """
655
+ assert x.ndim == 5, f"Expected 5D input, got {x.shape}"
656
+ x = self.fourier_features(x)
657
+
658
+ x = self.layers(x)
659
+
660
+ x = self.output_norm(x)
661
+ x = F.silu(x, inplace=True)
662
+ x = self.output_proj(x)
663
+
664
+ means, logvar = torch.chunk(x, 2, dim=1)
665
+
666
+ assert means.ndim == 5
667
+ assert logvar.shape == means.shape
668
+ assert means.size(1) == self.latent_dim
669
+
670
+ return LatentDistribution(means, logvar)
671
+
672
+
673
+ class VideoVAE(nn.Module):
674
+ def __init__(self):
675
+ super().__init__()
676
+ self.encoder = Encoder(
677
+ in_channels=15,
678
+ base_channels=64,
679
+ channel_multipliers=[1, 2, 4, 6],
680
+ num_res_blocks=[3, 3, 4, 6, 3],
681
+ latent_dim=12,
682
+ temporal_reductions=[1, 2, 3],
683
+ spatial_reductions=[2, 2, 2],
684
+ prune_bottlenecks=[False, False, False, False, False],
685
+ has_attentions=[False, True, True, True, True],
686
+ affine=True,
687
+ bias=True,
688
+ input_is_conv_1x1=True,
689
+ padding_mode="replicate"
690
+ )
691
+ self.decoder = Decoder(
692
+ out_channels=3,
693
+ base_channels=128,
694
+ channel_multipliers=[1, 2, 4, 6],
695
+ temporal_expansions=[1, 2, 3],
696
+ spatial_expansions=[2, 2, 2],
697
+ num_res_blocks=[3, 3, 4, 6, 3],
698
+ latent_dim=12,
699
+ has_attention=[False, False, False, False, False],
700
+ padding_mode="replicate",
701
+ output_norm=False,
702
+ nonlinearity="silu",
703
+ output_nonlinearity="silu",
704
+ causal=True,
705
+ )
706
+
707
+ def encode(self, x):
708
+ return self.encoder(x).mode()
709
+
710
+ def decode(self, x):
711
+ return self.decoder(x)
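For this configuration the encoder reductions multiply out to 6x temporal and 8x spatial compression (temporal_reductions=[1, 2, 3], spatial_reductions=[2, 2, 2]), and the decoder expansions mirror them. A back-of-the-envelope sketch of the latent shape, pure arithmetic with an illustrative input size:

import math

temporal_reductions = [1, 2, 3]
spatial_reductions = [2, 2, 2]
T, H, W = 163, 480, 848                              # illustrative input video size

t = (T - 1) // math.prod(temporal_reductions) + 1    # 28 latent frames
h = H // math.prod(spatial_reductions)               # 60
w = W // math.prod(spatial_reductions)               # 106
print(t, h, w)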
vae.py ADDED
@@ -0,0 +1,131 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """The causal continuous video tokenizer with VAE or AE formulation for 3D data."""
16
+
17
+ import logging
18
+ import torch
19
+ from torch import nn
20
+ from enum import Enum
21
+ import math
22
+
23
+ from .cosmos_tokenizer.layers3d import (
24
+ EncoderFactorized,
25
+ DecoderFactorized,
26
+ CausalConv3d,
27
+ )
28
+
29
+
30
+ class IdentityDistribution(torch.nn.Module):
31
+ def __init__(self):
32
+ super().__init__()
33
+
34
+ def forward(self, parameters):
35
+ return parameters, (torch.tensor([0.0]), torch.tensor([0.0]))
36
+
37
+
38
+ class GaussianDistribution(torch.nn.Module):
39
+ def __init__(self, min_logvar: float = -30.0, max_logvar: float = 20.0):
40
+ super().__init__()
41
+ self.min_logvar = min_logvar
42
+ self.max_logvar = max_logvar
43
+
44
+ def sample(self, mean, logvar):
45
+ std = torch.exp(0.5 * logvar)
46
+ return mean + std * torch.randn_like(mean)
47
+
48
+ def forward(self, parameters):
49
+ mean, logvar = torch.chunk(parameters, 2, dim=1)
50
+ logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
51
+ return self.sample(mean, logvar), (mean, logvar)
52
+
53
+
54
+ class ContinuousFormulation(Enum):
55
+ VAE = GaussianDistribution
56
+ AE = IdentityDistribution
57
+
58
+
59
+ class CausalContinuousVideoTokenizer(nn.Module):
60
+ def __init__(
61
+ self, z_channels: int, z_factor: int, latent_channels: int, **kwargs
62
+ ) -> None:
63
+ super().__init__()
64
+ self.name = kwargs.get("name", "CausalContinuousVideoTokenizer")
65
+ self.latent_channels = latent_channels
66
+ self.sigma_data = 0.5
67
+
68
+ # encoder_name = kwargs.get("encoder", Encoder3DType.BASE.name)
69
+ self.encoder = EncoderFactorized(
70
+ z_channels=z_factor * z_channels, **kwargs
71
+ )
72
+ if kwargs.get("temporal_compression", 4) == 4:
73
+ kwargs["channels_mult"] = [2, 4]
74
+ # decoder_name = kwargs.get("decoder", Decoder3DType.BASE.name)
75
+ self.decoder = DecoderFactorized(
76
+ z_channels=z_channels, **kwargs
77
+ )
78
+
79
+ self.quant_conv = CausalConv3d(
80
+ z_factor * z_channels,
81
+ z_factor * latent_channels,
82
+ kernel_size=1,
83
+ padding=0,
84
+ )
85
+ self.post_quant_conv = CausalConv3d(
86
+ latent_channels, z_channels, kernel_size=1, padding=0
87
+ )
88
+
89
+ # formulation_name = kwargs.get("formulation", ContinuousFormulation.AE.name)
90
+ self.distribution = IdentityDistribution() # ContinuousFormulation[formulation_name].value()
91
+
92
+ num_parameters = sum(param.numel() for param in self.parameters())
93
+ logging.debug(f"model={self.name}, num_parameters={num_parameters:,}")
94
+ logging.debug(
95
+ f"z_channels={z_channels}, latent_channels={self.latent_channels}."
96
+ )
97
+
98
+ latent_temporal_chunk = 16
99
+ self.latent_mean = nn.Parameter(torch.zeros([self.latent_channels * latent_temporal_chunk], dtype=torch.float32))
100
+ self.latent_std = nn.Parameter(torch.ones([self.latent_channels * latent_temporal_chunk], dtype=torch.float32))
101
+
102
+
103
+ def encode(self, x):
104
+ h = self.encoder(x)
105
+ moments = self.quant_conv(h)
106
+ z, posteriors = self.distribution(moments)
107
+ latent_ch = z.shape[1]
108
+ latent_t = z.shape[2]
109
+ in_dtype = z.dtype
110
+ mean = self.latent_mean.view(latent_ch, -1)
111
+ std = self.latent_std.view(latent_ch, -1)
112
+
113
+ mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
114
+ std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
115
+ return ((z - mean) / std) * self.sigma_data
116
+
117
+ def decode(self, z):
118
+ in_dtype = z.dtype
119
+ latent_ch = z.shape[1]
120
+ latent_t = z.shape[2]
121
+ mean = self.latent_mean.view(latent_ch, -1)
122
+ std = self.latent_std.view(latent_ch, -1)
123
+
124
+ mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
125
+ std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
126
+
127
+ z = z / self.sigma_data
128
+ z = z * std + mean
129
+ z = self.post_quant_conv(z)
130
+ return self.decoder(z)
131
+
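Both encode and decode above tile a per-(channel, chunk-frame) mean and std over however many latent frames are present, then scale by sigma_data. A minimal numeric sketch of that tiling, using stand-in tensors instead of the real nn.Parameter buffers:

import math
import torch

latent_ch, latent_temporal_chunk, latent_t = 16, 16, 10
latent_mean = torch.zeros(latent_ch * latent_temporal_chunk)   # stand-in for self.latent_mean

mean = latent_mean.view(latent_ch, -1)
mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, :latent_t]
mean = mean.reshape(1, latent_ch, -1, 1, 1)
print(mean.shape)   # torch.Size([1, 16, 10, 1, 1]) -- one value per channel per latent frame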
vae/put_vae_here ADDED
File without changes