ZhiyuanthePony committed
Commit f876753 · 1 Parent(s): e9a4e66
app.py CHANGED
@@ -1,7 +1,160 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import os
+ import torch
  import gradio as gr
+ from typing import *
+ from collections import deque
+ from diffusers import StableDiffusionPipeline
+
+ from triplaneturbo_executable import TriplaneTurboTextTo3DPipeline
+ from triplaneturbo_executable.utils.mesh_exporter import export_obj
+
+ # Initialize global variables
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ ADAPTER_PATH = "pretrained/triplane_turbo_sd_v1.pth"
+ PIPELINE = None  # Will hold our pipeline instance
+ OBJ_FILE_QUEUE = deque(maxlen=100)  # Queue to store OBJ file paths
+
+ def download_model():
+     """Download the pretrained model if it does not exist"""
+     if not os.path.exists(ADAPTER_PATH):
+         print("Downloading pretrained models from huggingface")
+         os.system(
+             f"huggingface-cli download --resume-download ZhiyuanthePony/TriplaneTurbo \
+             --include \"triplane_turbo_sd_v1.pth\" \
+             --local-dir ./pretrained \
+             --local-dir-use-symlinks False"
+         )
+
+ def initialize_pipeline():
+     """Initialize the pipeline once and keep it in memory"""
+     global PIPELINE
+     if PIPELINE is None:
+         print("Initializing pipeline...")
+         PIPELINE = TriplaneTurboTextTo3DPipeline.from_pretrained(ADAPTER_PATH)
+         PIPELINE.to(DEVICE)
+         print("Pipeline initialized!")
+     return PIPELINE
+
+ def generate_3d_mesh(prompt: str) -> Tuple[str, str]:
+     """Generate a 3D mesh from a text prompt"""
+     global PIPELINE, OBJ_FILE_QUEUE
+
+     # Use the global pipeline instance
+     pipeline = initialize_pipeline()
+
+     # Use a fixed seed value
+     seed = 42
+
+     # Generate mesh
+     output = pipeline(
+         prompt=prompt,
+         num_results_per_prompt=1,
+         generator=torch.Generator(device=DEVICE).manual_seed(seed),
+     )
+
+     # Save mesh
+     output_dir = "outputs"
+     os.makedirs(output_dir, exist_ok=True)
+
+     mesh_path = None
+     for i, mesh in enumerate(output["mesh"]):
+         vertices = mesh.v_pos
+
+         # 1. First rotate -90 degrees around X-axis to make the model face up
+         vertices = torch.stack([
+             vertices[:, 0],   # x remains unchanged
+             vertices[:, 2],   # y = z
+             -vertices[:, 1]   # z = -y
+         ], dim=1)
+
+         # 2. Then rotate 90 degrees around Y-axis to make the model face the observer
+         vertices = torch.stack([
+             -vertices[:, 2],  # x = -z
+             vertices[:, 1],   # y remains unchanged
+             vertices[:, 0]    # z = x
+         ], dim=1)
+
+         mesh.v_pos = vertices
+
+         # If the mesh has normals, they need to be rotated in the same way
+         if mesh.v_nrm is not None:
+             normals = mesh.v_nrm
+             # 1. Rotate -90 degrees around X-axis
+             normals = torch.stack([
+                 normals[:, 0],
+                 normals[:, 2],
+                 -normals[:, 1]
+             ], dim=1)
+             # 2. Rotate 90 degrees around Y-axis
+             normals = torch.stack([
+                 -normals[:, 2],
+                 normals[:, 1],
+                 normals[:, 0]
+             ], dim=1)
+             mesh._v_nrm = normals
+
+         name = f"{prompt.replace(' ', '_')}"
+         save_paths = export_obj(mesh, f"{output_dir}/{name}.obj")
+         mesh_path = save_paths[0]
+
+         # Add the new file path to the queue
+         OBJ_FILE_QUEUE.append(mesh_path)
+
+         # If the queue is at max length, remove the oldest file
+         if len(OBJ_FILE_QUEUE) == OBJ_FILE_QUEUE.maxlen:
+             old_file = OBJ_FILE_QUEUE[0]  # Get the oldest file (it will be dropped from the queue automatically)
+             if os.path.exists(old_file):
+                 try:
+                     os.remove(old_file)
+                 except OSError as e:
+                     print(f"Error deleting file {old_file}: {e}")
+
+     return mesh_path, mesh_path  # Return the path twice: once for the 3D preview, once for download
+
+ def main():
+     # Download the model if needed
+     download_model()
+
+     # Initialize the pipeline at startup
+     initialize_pipeline()
+
+     # Create the Gradio interface
+     iface = gr.Interface(
+         fn=generate_3d_mesh,
+         inputs=[
+             gr.Textbox(
+                 label="Text Prompt",
+                 placeholder="Enter your text description...",
+                 value="Armor dress style of outsiderzone fantasy helmet"
+             )
+         ],
+         outputs=[
+             gr.Model3D(
+                 label="Generated 3D Mesh",
+                 camera_position=(90, 90, 3),
+                 clear_color=(0.5, 0.5, 0.5, 1),
+             ),
+             gr.File(label="Download OBJ file")
+         ],
+         title="Text to 3D Mesh Generation with TriplaneTurbo",
+         description="Demo of the paper Progressive Rendering Distillation: Adapting Stable Diffusion for Instant Text-to-Mesh Generation beyond 3D Training Data [CVPR 2025] <br><a href='https://github.com/theEricMa/TriplaneTurbo' style='color: #2196F3;'>https://github.com/theEricMa/TriplaneTurbo</a>",
+         examples=[
+             ["Armor dress style of outsiderzone fantasy helmet"],
+             ["Gandalf the grey riding a camel in a rock concert, victorian newspaper article, hyperrealistic"],
+             ["A DSLR photo of a bald eagle"],
+             ["A goblin riding a lawnmower in a hospital, victorian newspaper article, 4k hd"],
+             ["An imperial stormtrooper, highly detailed"],
+         ],
+         allow_flagging="never",
+     )
+
+     # Launch the interface
+     iface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=True,
+         show_error=True,
+     )
+
+ if __name__ == "__main__":
+     main()
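Note on the coordinate fix-up above: the same two-step rotation (first -90 degrees about the X-axis, then a rotation about the Y-axis) is applied to both vertices and normals, and it is duplicated verbatim in app.py and example.py. Below is a minimal sketch of how it could be factored into a shared helper; the name `rotate_mesh_to_front` is hypothetical and not part of this commit, and it only assumes that `mesh.v_pos` / `mesh.v_nrm` are (N, 3) tensors as used above.

import torch

def rotate_mesh_to_front(mesh):
    """Hypothetical helper replicating the rotation applied in app.py/example.py."""
    def _rotate(p):
        # 1. Rotate -90 degrees around the X-axis: (x, y, z) -> (x, z, -y)
        p = torch.stack([p[:, 0], p[:, 2], -p[:, 1]], dim=1)
        # 2. Rotate around the Y-axis: (x, y, z) -> (-z, y, x)
        return torch.stack([-p[:, 2], p[:, 1], p[:, 0]], dim=1)

    mesh.v_pos = _rotate(mesh.v_pos)
    if mesh.v_nrm is not None:
        mesh._v_nrm = _rotate(mesh.v_nrm)
    return mesh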
example.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ import torch
+ import argparse
+ from typing import *
+ from diffusers import StableDiffusionPipeline
+ from collections import deque
+
+ from triplaneturbo_executable.utils.mesh_exporter import export_obj
+ from triplaneturbo_executable import TriplaneTurboTextTo3DPipeline, TriplaneTurboTextTo3DPipelineConfig
+
+
+
+ # Initialize configuration and parameters
+ prompt = "a beautiful girl"
+ output_dir = "examples/output"
+ adapter_name_or_path = "pretrained/triplane_turbo_sd_v1.pth"
+ num_results_per_prompt = 1
+ seed = 42
+ device = "cuda"
+ max_obj_files = 100
+
+ # Download the pretrained models if they do not exist
+ if not os.path.exists(adapter_name_or_path):
+     print(f"Downloading pretrained models from huggingface")
+     os.system(
+         f"huggingface-cli download --resume-download ZhiyuanthePony/TriplaneTurbo \
+         --include \"triplane_turbo_sd_v1.pth\" \
+         --local-dir ./pretrained \
+         --local-dir-use-symlinks False"
+     )
+
+
+ # Initialize the TriplaneTurbo pipeline
+ triplane_turbo_pipeline = TriplaneTurboTextTo3DPipeline.from_pretrained(adapter_name_or_path)
+ triplane_turbo_pipeline.to(device)
+
+ # Run the pipeline
+ output = triplane_turbo_pipeline(
+     prompt=prompt,
+     num_results_per_prompt=num_results_per_prompt,
+     generator=torch.Generator(device=device).manual_seed(seed),
+     device=device,
+ )
+
+ # Initialize a deque with a maximum length of 100 to store obj file paths
+ obj_file_queue = deque(maxlen=max_obj_files)
+
+ # Save mesh
+ os.makedirs(output_dir, exist_ok=True)
+ for i, mesh in enumerate(output["mesh"]):
+     vertices = mesh.v_pos
+
+     # 1. First rotate -90 degrees around X-axis to make the model face up
+     vertices = torch.stack([
+         vertices[:, 0],   # x remains unchanged
+         vertices[:, 2],   # y = z
+         -vertices[:, 1]   # z = -y
+     ], dim=1)
+
+     # 2. Then rotate 90 degrees around Y-axis to make the model face the observer
+     vertices = torch.stack([
+         -vertices[:, 2],  # x = -z
+         vertices[:, 1],   # y remains unchanged
+         vertices[:, 0]    # z = x
+     ], dim=1)
+
+     mesh.v_pos = vertices
+
+     # If the mesh has normals, they need to be rotated in the same way
+     if mesh.v_nrm is not None:
+         normals = mesh.v_nrm
+         # 1. Rotate -90 degrees around X-axis
+         normals = torch.stack([
+             normals[:, 0],
+             normals[:, 2],
+             -normals[:, 1]
+         ], dim=1)
+         # 2. Rotate 90 degrees around Y-axis
+         normals = torch.stack([
+             -normals[:, 2],
+             normals[:, 1],
+             normals[:, 0]
+         ], dim=1)
+         mesh._v_nrm = normals
+
+     # Save the obj file and add its path to the queue
+     name = f"{prompt.replace(' ', '_')}_{seed}_{i}"
+     save_paths = export_obj(mesh, f"{output_dir}/{name}.obj")
+     obj_file_queue.append(save_paths[0])
+
+     # If an old file needs to be removed (queue is at max length)
+     # and the file exists, delete it
+     if len(obj_file_queue) == max_obj_files and os.path.exists(obj_file_queue[0]):
+         old_file = obj_file_queue[0]
+         try:
+             os.remove(old_file)
+         except OSError as e:
+             print(f"Error deleting file {old_file}: {e}")
+
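For context on the clean-up logic above: `collections.deque(maxlen=...)` silently drops its oldest entry when a new path is appended to a full queue, which is why the script also deletes old .obj files from disk explicitly rather than relying on the queue alone. A standalone toy sketch of that standard-library behavior (illustrative only, not part of this commit):

from collections import deque

q = deque(maxlen=3)
for path in ["a.obj", "b.obj", "c.obj", "d.obj"]:
    if len(q) == q.maxlen:
        evicted = q[0]  # oldest path, about to fall out of the queue on the next append
        print(f"would delete {evicted} from disk here")
    q.append(path)

print(list(q))  # ['b.obj', 'c.obj', 'd.obj']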
requirements.txt ADDED
@@ -0,0 +1,26 @@
+ omegaconf==2.3.0
+ jaxtyping
+ typeguard
+ diffusers==0.25
+ transformers==4.28.1
+ accelerate
+ imageio>=2.28.0
+ imageio[ffmpeg]
+ git+https://github.com/NVlabs/nvdiffrast.git
+ libigl
+ trimesh[easy]
+ networkx
+ pysdf
+ PyMCubes
+ wandb
+ torchmetrics
+ huggingface_hub==0.24.7
+ numpy==1.26.4
+ gradio==2.9.4
+
+ # # 3d gaussian
+ # plyfile
+
+ # diffmc
+ diso
+ einops
triplaneturbo_executable/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .pipelines.triplaneturbo_text_to_3d import (
+     TriplaneTurboTextTo3DPipeline,
+     TriplaneTurboTextTo3DPipelineConfig
+ )
+
+ __all__ = [
+     "TriplaneTurboTextTo3DPipeline",
+     "TriplaneTurboTextTo3DPipelineConfig"
+ ]
triplaneturbo_executable/extern/sd_dual_triplane_modules.py ADDED
@@ -0,0 +1,981 @@
1
+ import re
2
+ import torch
3
+ import torch.nn as nn
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Union, Tuple
6
+
7
+ from diffusers.models.attention_processor import Attention
8
+ from diffusers import (
9
+ DDPMScheduler,
10
+ UNet2DConditionModel,
11
+ AutoencoderKL
12
+ )
13
+ from diffusers.loaders import AttnProcsLayers
14
+
15
+
16
+ class LoRALinearLayerwBias(nn.Module):
17
+ r"""
18
+ A linear layer that is used with LoRA; it can optionally include a bias term.
19
+
20
+ Parameters:
21
+ in_features (`int`):
22
+ Number of input features.
23
+ out_features (`int`):
24
+ Number of output features.
25
+ rank (`int`, `optional`, defaults to 4):
26
+ The rank of the LoRA layer.
27
+ network_alpha (`float`, `optional`, defaults to `None`):
28
+ The value of the network alpha used for stable learning and preventing underflow. This value has the same
29
+ meaning as the `--network_alpha` option in the kohya-ss trainer script. See
30
+ https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
31
+ device (`torch.device`, `optional`, defaults to `None`):
32
+ The device to use for the layer's weights.
33
+ dtype (`torch.dtype`, `optional`, defaults to `None`):
34
+ The dtype to use for the layer's weights.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ in_features: int,
40
+ out_features: int,
41
+ rank: int = 4,
42
+ network_alpha: Optional[float] = None,
43
+ device: Optional[Union[torch.device, str]] = None,
44
+ dtype: Optional[torch.dtype] = None,
45
+ with_bias: bool = False
46
+ ):
47
+ super().__init__()
48
+
49
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
50
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
51
+ if with_bias:
52
+ self.bias = nn.Parameter(torch.zeros([1, 1, out_features], device=device, dtype=dtype))
53
+ self.with_bias = with_bias
54
+
55
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
56
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
57
+ self.network_alpha = network_alpha
58
+ self.rank = rank
59
+ self.out_features = out_features
60
+ self.in_features = in_features
61
+
62
+ nn.init.normal_(self.down.weight, std=1 / rank)
63
+ nn.init.zeros_(self.up.weight)
64
+
65
+
66
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
67
+ orig_dtype = hidden_states.dtype
68
+ dtype = self.down.weight.dtype
69
+
70
+ down_hidden_states = self.down(hidden_states.to(dtype))
71
+ up_hidden_states = self.up(down_hidden_states)
72
+ if self.with_bias:
73
+ up_hidden_states = up_hidden_states + self.bias
74
+
75
+ if self.network_alpha is not None:
76
+ up_hidden_states *= self.network_alpha / self.rank
77
+
78
+ return up_hidden_states.to(orig_dtype)
79
+
80
+ class TriplaneLoRAConv2dLayer(nn.Module):
81
+ r"""
82
+ A convolutional layer that is used with LoRA.
83
+
84
+ Parameters:
85
+ in_features (`int`):
86
+ Number of input features.
87
+ out_features (`int`):
88
+ Number of output features.
89
+ rank (`int`, `optional`, defaults to 4):
90
+ The rank of the LoRA layer.
91
+ kernel_size (`int` or `tuple` of two `int`, `optional`, defaults to 1):
92
+ The kernel size of the convolution.
93
+ stride (`int` or `tuple` of two `int`, `optional`, defaults to 1):
94
+ The stride of the convolution.
95
+ padding (`int` or `tuple` of two `int` or `str`, `optional`, defaults to 0):
96
+ The padding of the convolution.
97
+ network_alpha (`float`, `optional`, defaults to `None`):
98
+ The value of the network alpha used for stable learning and preventing underflow. This value has the same
99
+ meaning as the `--network_alpha` option in the kohya-ss trainer script. See
100
+ https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
101
+ """
102
+
103
+ def __init__(
104
+ self,
105
+ in_features: int,
106
+ out_features: int,
107
+ rank: int = 4,
108
+ kernel_size: Union[int, Tuple[int, int]] = (1, 1),
109
+ stride: Union[int, Tuple[int, int]] = (1, 1),
110
+ padding: Union[int, Tuple[int, int], str] = 0,
111
+ network_alpha: Optional[float] = None,
112
+ with_bias: bool = False,
113
+ locon_type: str = "hexa_v1", #hexa_v2, vanilla_v1, vanilla_v2
114
+ ):
115
+ super().__init__()
116
+
117
+ assert locon_type in ["hexa_v1", "hexa_v2", "vanilla_v1", "vanilla_v2"], "The LoCON type is not supported."
118
+ if locon_type == "hexa_v1":
119
+ self.down_xy_geo = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
120
+ self.down_xz_geo = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
121
+ self.down_yz_geo = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
122
+ self.down_xy_tex = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
123
+ self.down_xz_tex = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
124
+ self.down_yz_tex = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
125
+ # according to the official kohya_ss trainer kernel_size are always fixed for the up layer
126
+ # # see: https://github.com/bmaltais/kohya_ss/blob/2accb1305979ba62f5077a23aabac23b4c37e935/networks/lora_diffusers.py#L129
127
+ self.up_xy_geo = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
128
+ self.up_xz_geo = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
129
+ self.up_yz_geo = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
130
+ self.up_xy_tex = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
131
+ self.up_xz_tex = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
132
+ self.up_yz_tex = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
133
+
134
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
135
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
136
+
137
+ elif locon_type == "hexa_v2":
138
+ self.down_xy_geo = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1),padding=padding, bias=False)
139
+ self.down_xz_geo = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1),padding=padding, bias=False)
140
+ self.down_yz_geo = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1),padding=padding, bias=False)
141
+ self.down_xy_tex = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1),padding=padding, bias=False)
142
+ self.down_xz_tex = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1),padding=padding, bias=False)
143
+ self.down_yz_tex = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1),padding=padding, bias=False)
144
+
145
+ self.up_xy_geo = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
146
+ self.up_xz_geo = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
147
+ self.up_yz_geo = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
148
+ self.up_xy_tex = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
149
+ self.up_xz_tex = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
150
+ self.up_yz_tex = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
151
+
152
+ elif locon_type == "vanilla_v1":
153
+ self.down = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
154
+ self.up = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=with_bias)
155
+
156
+ elif locon_type == "vanilla_v2":
157
+ self.down = nn.Conv2d(in_features, rank, kernel_size=(1, 1), stride=(1, 1), padding=padding, bias=False)
158
+ self.up = nn.Conv2d(rank, out_features, kernel_size=kernel_size, stride=stride, bias=with_bias)
159
+
160
+ self.network_alpha = network_alpha
161
+ self.rank = rank
162
+ self.locon_type = locon_type
163
+ self._init_weights()
164
+
165
+ def _init_weights(self):
166
+ for layer in [
167
+ "down_xy_geo", "down_xz_geo", "down_yz_geo", "down_xy_tex", "down_xz_tex", "down_yz_tex", # in case of hexa_vX
168
+ "up_xy", "up_xz", "up_yz", "up_xy_tex", "up_xz_tex", "up_yz_tex", # in case of hexa_vX
169
+ "down", "up" # in case of vanilla
170
+ ]:
171
+ if hasattr(self, layer):
172
+ # initialize the weights
173
+ if "down" in layer:
174
+ nn.init.normal_(getattr(self, layer).weight, std=1 / self.rank)
175
+ elif "up" in layer:
176
+ nn.init.zeros_(getattr(self, layer).weight)
177
+ # initialize the bias
178
+ if getattr(self, layer).bias is not None:
179
+ nn.init.zeros_(getattr(self, layer).bias)
180
+
181
+
182
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
183
+ orig_dtype = hidden_states.dtype
184
+ dtype = self.down_xy_geo.weight.dtype if "hexa" in self.locon_type else self.down.weight.dtype
185
+
186
+ if "hexa" in self.locon_type:
187
+ # xy plane
188
+ hidden_states_xy_geo = self.up_xy_geo(self.down_xy_geo(hidden_states[0::6].to(dtype)))
189
+ hidden_states_xy_tex = self.up_xy_tex(self.down_xy_tex(hidden_states[3::6].to(dtype)))
190
+
191
+ lora_hidden_states = torch.concat(
192
+ [torch.zeros_like(hidden_states_xy_tex)] * 6,
193
+ dim=0
194
+ )
195
+
196
+ lora_hidden_states[0::6] = hidden_states_xy_geo
197
+ lora_hidden_states[3::6] = hidden_states_xy_tex
198
+
199
+ # xz plane
200
+ lora_hidden_states[1::6] = self.up_xz_geo(self.down_xz_geo(hidden_states[1::6].to(dtype)))
201
+ lora_hidden_states[4::6] = self.up_xz_tex(self.down_xz_tex(hidden_states[4::6].to(dtype)))
202
+ # yz plane
203
+ lora_hidden_states[2::6] = self.up_yz_geo(self.down_yz_geo(hidden_states[2::6].to(dtype)))
204
+ lora_hidden_states[5::6] = self.up_yz_tex(self.down_yz_tex(hidden_states[5::6].to(dtype)))
205
+
206
+ elif "vanilla" in self.locon_type:
207
+ lora_hidden_states = self.up(self.down(hidden_states.to(dtype)))
208
+
209
+ if self.network_alpha is not None:
210
+ lora_hidden_states *= self.network_alpha / self.rank
211
+
212
+ return lora_hidden_states.to(orig_dtype)
213
+
214
+ class TriplaneSelfAttentionLoRAAttnProcessor(nn.Module):
215
+ """
216
+ Attention processor implementing Triplane self-attention with LoRA.
217
+ """
218
+
219
+ def __init__(
220
+ self,
221
+ hidden_size: int,
222
+ rank: int = 4,
223
+ network_alpha: Optional[float] = None,
224
+ with_bias: bool = False,
225
+ lora_type: str = "hexa_v1", # vanilla,
226
+ ):
227
+ super().__init__()
228
+
229
+ assert lora_type in ["hexa_v1", "vanilla", "none", "basic"], "The LoRA type is not supported."
230
+
231
+ self.hidden_size = hidden_size
232
+ self.rank = rank
233
+ self.lora_type = lora_type
234
+
235
+ if lora_type in ["hexa_v1"]:
236
+ # lora for 1st plane geometry
237
+ self.to_q_xy_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
238
+ self.to_k_xy_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
239
+ self.to_v_xy_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
240
+ self.to_out_xy_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
241
+
242
+ # lora for 1st plane texture
243
+ self.to_q_xy_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
244
+ self.to_k_xy_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
245
+ self.to_v_xy_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
246
+ self.to_out_xy_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
247
+
248
+ # lora for 2nd plane geometry
249
+ self.to_q_xz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
250
+ self.to_k_xz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
251
+ self.to_v_xz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
252
+ self.to_out_xz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
253
+
254
+ # lora for 2nd plane texture
255
+ self.to_q_xz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
256
+ self.to_k_xz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
257
+ self.to_v_xz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
258
+ self.to_out_xz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
259
+
260
+ # lora for 3rd plane geometry
261
+ self.to_q_yz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
262
+ self.to_k_yz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
263
+ self.to_v_yz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
264
+ self.to_out_yz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
265
+
266
+ # lora for 3rd plane texture
267
+ self.to_q_yz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
268
+ self.to_k_yz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
269
+ self.to_v_yz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
270
+ self.to_out_yz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
271
+
272
+ elif lora_type in ["vanilla", "basic"]:
273
+ self.to_q_lora = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
274
+ self.to_k_lora = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
275
+ self.to_v_lora = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
276
+ self.to_out_lora = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
277
+
278
+ def __call__(
279
+ self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
280
+ ):
281
+ assert encoder_hidden_states is None, "The encoder_hidden_states should be None."
282
+
283
+ residual = hidden_states
284
+
285
+ if attn.spatial_norm is not None:
286
+ hidden_states = attn.spatial_norm(hidden_states, temb)
287
+
288
+ input_ndim = hidden_states.ndim
289
+
290
+ if input_ndim == 4:
291
+ batch_size, channel, height, width = hidden_states.shape
292
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
293
+
294
+ batch_size, sequence_length, _ = (
295
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
296
+ )
297
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
298
+
299
+ if attn.group_norm is not None:
300
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
301
+
302
+
303
+ ############################################################################################################
304
+ # query
305
+ if self.lora_type in ["hexa_v1",]:
306
+ query = attn.to_q(hidden_states)
307
+ _query_new = torch.zeros_like(query)
308
+ # lora for xy plane geometry
309
+ _query_new[0::6] = self.to_q_xy_lora_geo(hidden_states[0::6])
310
+ # lora for xy plane texture
311
+ _query_new[3::6] = self.to_q_xy_lora_tex(hidden_states[3::6])
312
+ # lora for xz plane geometry
313
+ _query_new[1::6] = self.to_q_xz_lora_geo(hidden_states[1::6])
314
+ # lora for xz plane texture
315
+ _query_new[4::6] = self.to_q_xz_lora_tex(hidden_states[4::6])
316
+ # lora for yz plane geometry
317
+ _query_new[2::6] = self.to_q_yz_lora_geo(hidden_states[2::6])
318
+ # lora for yz plane texture
319
+ _query_new[5::6] = self.to_q_yz_lora_tex(hidden_states[5::6])
320
+ query = query + scale * _query_new
321
+
322
+ # # speed up inference
323
+ # query[0::6] += self.to_q_xy_lora_geo(hidden_states[0::6]) * scale
324
+ # query[3::6] += self.to_q_xy_lora_tex(hidden_states[3::6]) * scale
325
+ # query[1::6] += self.to_q_xz_lora_geo(hidden_states[1::6]) * scale
326
+ # query[4::6] += self.to_q_xz_lora_tex(hidden_states[4::6]) * scale
327
+ # query[2::6] += self.to_q_yz_lora_geo(hidden_states[2::6]) * scale
328
+ # query[5::6] += self.to_q_yz_lora_tex(hidden_states[5::6]) * scale
329
+
330
+ elif self.lora_type in ["vanilla", "basic"]:
331
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
332
+ elif self.lora_type in ["none"]:
333
+ query = attn.to_q(hidden_states)
334
+ else:
335
+ raise NotImplementedError("The LoRA type is not supported for the query in HplaneSelfAttentionLoRAAttnProcessor.")
336
+
337
+ ############################################################################################################
338
+
339
+ if encoder_hidden_states is None:
340
+ encoder_hidden_states = hidden_states
341
+ elif attn.norm_cross:
342
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
343
+
344
+ ############################################################################################################
345
+ # key and value
346
+ if self.lora_type in ["hexa_v1",]:
347
+ key = attn.to_k(encoder_hidden_states)
348
+ _key_new = torch.zeros_like(key)
349
+ # lora for xy plane geometry
350
+ _key_new[0::6] = self.to_k_xy_lora_geo(encoder_hidden_states[0::6])
351
+ # lora for xy plane texture
352
+ _key_new[3::6] = self.to_k_xy_lora_tex(encoder_hidden_states[3::6])
353
+ # lora for xz plane geometry
354
+ _key_new[1::6] = self.to_k_xz_lora_geo(encoder_hidden_states[1::6])
355
+ # lora for xz plane texture
356
+ _key_new[4::6] = self.to_k_xz_lora_tex(encoder_hidden_states[4::6])
357
+ # lora for yz plane geometry
358
+ _key_new[2::6] = self.to_k_yz_lora_geo(encoder_hidden_states[2::6])
359
+ # lora for yz plane texture
360
+ _key_new[5::6] = self.to_k_yz_lora_tex(encoder_hidden_states[5::6])
361
+ key = key + scale * _key_new
362
+
363
+ # # speed up inference
364
+ # key[0::6] += self.to_k_xy_lora_geo(encoder_hidden_states[0::6]) * scale
365
+ # key[3::6] += self.to_k_xy_lora_tex(encoder_hidden_states[3::6]) * scale
366
+ # key[1::6] += self.to_k_xz_lora_geo(encoder_hidden_states[1::6]) * scale
367
+ # key[4::6] += self.to_k_xz_lora_tex(encoder_hidden_states[4::6]) * scale
368
+ # key[2::6] += self.to_k_yz_lora_geo(encoder_hidden_states[2::6]) * scale
369
+ # key[5::6] += self.to_k_yz_lora_tex(encoder_hidden_states[5::6]) * scale
370
+
371
+ value = attn.to_v(encoder_hidden_states)
372
+ _value_new = torch.zeros_like(value)
373
+ # lora for xy plane geometry
374
+ _value_new[0::6] = self.to_v_xy_lora_geo(encoder_hidden_states[0::6])
375
+ # lora for xy plane texture
376
+ _value_new[3::6] = self.to_v_xy_lora_tex(encoder_hidden_states[3::6])
377
+ # lora for xz plane geometry
378
+ _value_new[1::6] = self.to_v_xz_lora_geo(encoder_hidden_states[1::6])
379
+ # lora for xz plane texture
380
+ _value_new[4::6] = self.to_v_xz_lora_tex(encoder_hidden_states[4::6])
381
+ # lora for yz plane geometry
382
+ _value_new[2::6] = self.to_v_yz_lora_geo(encoder_hidden_states[2::6])
383
+ # lora for yz plane texture
384
+ _value_new[5::6] = self.to_v_yz_lora_tex(encoder_hidden_states[5::6])
385
+ value = value + scale * _value_new
386
+
387
+ # # speed up inference
388
+ # value[0::6] += self.to_v_xy_lora_geo(encoder_hidden_states[0::6]) * scale
389
+ # value[3::6] += self.to_v_xy_lora_tex(encoder_hidden_states[3::6]) * scale
390
+ # value[1::6] += self.to_v_xz_lora_geo(encoder_hidden_states[1::6]) * scale
391
+ # value[4::6] += self.to_v_xz_lora_tex(encoder_hidden_states[4::6]) * scale
392
+ # value[2::6] += self.to_v_yz_lora_geo(encoder_hidden_states[2::6]) * scale
393
+ # value[5::6] += self.to_v_yz_lora_tex(encoder_hidden_states[5::6]) * scale
394
+
395
+ elif self.lora_type in ["vanilla", "basic"]:
396
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
397
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
398
+
399
+ elif self.lora_type in ["none", ]:
400
+ key = attn.to_k(encoder_hidden_states)
401
+ value = attn.to_v(encoder_hidden_states)
402
+
403
+ else:
404
+ raise NotImplementedError("The LoRA type is not supported for the key and value in HplaneSelfAttentionLoRAAttnProcessor.")
405
+
406
+ ############################################################################################################
407
+ # attention scores
408
+
409
+ # in self-attention, query of each plane should be used to calculate the attention scores of all planes
410
+ if self.lora_type in ["hexa_v1", "vanilla",]:
411
+ query = attn.head_to_batch_dim(
412
+ query.view(batch_size // 6, sequence_length * 6, self.hidden_size)
413
+ )
414
+ key = attn.head_to_batch_dim(
415
+ key.view(batch_size // 6, sequence_length * 6, self.hidden_size)
416
+ )
417
+ value = attn.head_to_batch_dim(
418
+ value.view(batch_size // 6, sequence_length * 6, self.hidden_size)
419
+ )
420
+ # calculate the attention scores
421
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
422
+ hidden_states = torch.bmm(attention_probs, value)
423
+ hidden_states = attn.batch_to_head_dim(hidden_states)
424
+ # split the hidden states into 6 planes
425
+ hidden_states = hidden_states.view(batch_size, sequence_length, self.hidden_size)
426
+ elif self.lora_type in ["none", "basic"]:
427
+ query = attn.head_to_batch_dim(query)
428
+ key = attn.head_to_batch_dim(key)
429
+ value = attn.head_to_batch_dim(value)
430
+ # calculate the attention scores
431
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
432
+ hidden_states = torch.bmm(attention_probs, value)
433
+ hidden_states = attn.batch_to_head_dim(hidden_states)
434
+ else:
435
+ raise NotImplementedError("The LoRA type is not supported for attention scores calculation in HplaneSelfAttentionLoRAAttnProcessor.")
436
+
437
+ ############################################################################################################
438
+ # linear proj
439
+ if self.lora_type in ["hexa_v1", ]:
440
+ hidden_states = attn.to_out[0](hidden_states)
441
+ _hidden_states_new = torch.zeros_like(hidden_states)
442
+ # lora for xy plane geometry
443
+ _hidden_states_new[0::6] = self.to_out_xy_lora_geo(hidden_states[0::6])
444
+ # lora for xy plane texture
445
+ _hidden_states_new[3::6] = self.to_out_xy_lora_tex(hidden_states[3::6])
446
+ # lora for xz plane geometry
447
+ _hidden_states_new[1::6] = self.to_out_xz_lora_geo(hidden_states[1::6])
448
+ # lora for xz plane texture
449
+ _hidden_states_new[4::6] = self.to_out_xz_lora_tex(hidden_states[4::6])
450
+ # lora for yz plane geometry
451
+ _hidden_states_new[2::6] = self.to_out_yz_lora_geo(hidden_states[2::6])
452
+ # lora for yz plane texture
453
+ _hidden_states_new[5::6] = self.to_out_yz_lora_tex(hidden_states[5::6])
454
+ hidden_states = hidden_states + scale * _hidden_states_new
455
+
456
+ # # speed up inference
457
+ # hidden_states[0::6] += self.to_out_xy_lora_geo(hidden_states[0::6]) * scale
458
+ # hidden_states[3::6] += self.to_out_xy_lora_tex(hidden_states[3::6]) * scale
459
+ # hidden_states[1::6] += self.to_out_xz_lora_geo(hidden_states[1::6]) * scale
460
+ # hidden_states[4::6] += self.to_out_xz_lora_tex(hidden_states[4::6]) * scale
461
+ # hidden_states[2::6] += self.to_out_yz_lora_geo(hidden_states[2::6]) * scale
462
+ # hidden_states[5::6] += self.to_out_yz_lora_tex(hidden_states[5::6]) * scale
463
+
464
+ elif self.lora_type in ["vanilla", "basic"]:
465
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
466
+ elif self.lora_type in ["none",]:
467
+ hidden_states = attn.to_out[0](hidden_states)
468
+ else:
469
+ raise NotImplementedError("The LoRA type is not supported for the to_out layer in HplaneSelfAttentionLoRAAttnProcessor.")
470
+
471
+ # dropout
472
+ hidden_states = attn.to_out[1](hidden_states)
473
+ ############################################################################################################
474
+
475
+ if input_ndim == 4:
476
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
477
+
478
+ if attn.residual_connection:
479
+ hidden_states = hidden_states + residual
480
+
481
+ hidden_states = hidden_states / attn.rescale_output_factor
482
+
483
+ return hidden_states
484
+
485
+ class TriplaneCrossAttentionLoRAAttnProcessor(nn.Module):
486
+ """
487
+ Attention processor implementing Triplane cross-attention with LoRA.
488
+ """
489
+
490
+ def __init__(
491
+ self,
492
+ hidden_size: int,
493
+ cross_attention_dim: int,
494
+ rank: int = 4,
495
+ network_alpha: Optional[float] = None,
496
+ with_bias: bool = False,
497
+ lora_type: str = "hexa_v1", # vanilla,
498
+ ):
499
+ super().__init__()
500
+
501
+ assert lora_type in ["hexa_v1", "vanilla", "none"], "The LoRA type is not supported."
502
+
503
+ self.hidden_size = hidden_size
504
+ self.rank = rank
505
+ self.lora_type = lora_type
506
+
507
+ if lora_type in ["hexa_v1"]:
508
+ # lora for 1st plane geometry
509
+ self.to_q_xy_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
510
+ self.to_k_xy_lora_geo = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
511
+ self.to_v_xy_lora_geo = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
512
+ self.to_out_xy_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
513
+
514
+ # lora for 1st plane texture
515
+ self.to_q_xy_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
516
+ self.to_k_xy_lora_tex = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
517
+ self.to_v_xy_lora_tex = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
518
+ self.to_out_xy_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
519
+
520
+ # lora for 2nd plane geometry
521
+ self.to_q_xz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
522
+ self.to_k_xz_lora_geo = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
523
+ self.to_v_xz_lora_geo = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
524
+ self.to_out_xz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
525
+
526
+ # lora for 2nd plane texture
527
+ self.to_q_xz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
528
+ self.to_k_xz_lora_tex = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
529
+ self.to_v_xz_lora_tex = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
530
+ self.to_out_xz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
531
+
532
+ # lora for 3rd plane geometry
533
+ self.to_q_yz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
534
+ self.to_k_yz_lora_geo = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
535
+ self.to_v_yz_lora_geo = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
536
+ self.to_out_yz_lora_geo = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
537
+
538
+ # lora for 3rd plane texture
539
+ self.to_q_yz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
540
+ self.to_k_yz_lora_tex = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
541
+ self.to_v_yz_lora_tex = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
542
+ self.to_out_yz_lora_tex = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
543
+
544
+ elif lora_type in ["vanilla"]:
545
+ # lora for all planes
546
+ self.to_q_lora = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
547
+ self.to_k_lora = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
548
+ self.to_v_lora = LoRALinearLayerwBias(cross_attention_dim, hidden_size, rank, network_alpha, with_bias=with_bias)
549
+ self.to_out_lora = LoRALinearLayerwBias(hidden_size, hidden_size, rank, network_alpha, with_bias=with_bias)
550
+
551
+ def __call__(
552
+ self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
553
+ ):
554
+ assert encoder_hidden_states is not None, "The encoder_hidden_states should not be None."
555
+
556
+ residual = hidden_states
557
+
558
+ if attn.spatial_norm is not None:
559
+ hidden_states = attn.spatial_norm(hidden_states, temb)
560
+
561
+ input_ndim = hidden_states.ndim
562
+
563
+ if input_ndim == 4:
564
+ batch_size, channel, height, width = hidden_states.shape
565
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
566
+
567
+ batch_size, sequence_length, _ = (
568
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
569
+ )
570
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
571
+
572
+ if attn.group_norm is not None:
573
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
574
+
575
+ ############################################################################################################
576
+ # query
577
+ if self.lora_type in ["hexa_v1",]:
578
+ query = attn.to_q(hidden_states)
579
+ _query_new = torch.zeros_like(query)
580
+ # lora for xy plane geometry
581
+ _query_new[0::6] = self.to_q_xy_lora_geo(hidden_states[0::6])
582
+ # lora for xy plane texture
583
+ _query_new[3::6] = self.to_q_xy_lora_tex(hidden_states[3::6])
584
+ # lora for xz plane geometry
585
+ _query_new[1::6] = self.to_q_xz_lora_geo(hidden_states[1::6])
586
+ # lora for xz plane texture
587
+ _query_new[4::6] = self.to_q_xz_lora_tex(hidden_states[4::6])
588
+ # lora for yz plane geometry
589
+ _query_new[2::6] = self.to_q_yz_lora_geo(hidden_states[2::6])
590
+ # lora for yz plane texture
591
+ _query_new[5::6] = self.to_q_yz_lora_tex(hidden_states[5::6])
592
+ query = query + scale * _query_new
593
+
594
+ elif self.lora_type == "vanilla":
595
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
596
+
597
+ elif self.lora_type == "none":
598
+ query = attn.to_q(hidden_states)
599
+
600
+ query = attn.head_to_batch_dim(query)
601
+ ############################################################################################################
602
+
603
+ if encoder_hidden_states is None:
604
+ encoder_hidden_states = hidden_states
605
+ elif attn.norm_cross:
606
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
607
+
608
+ ############################################################################################################
609
+ # key and value
610
+ if self.lora_type in ["hexa_v1",]:
611
+ key = attn.to_k(encoder_hidden_states)
612
+ _key_new = torch.zeros_like(key)
613
+ # lora for xy plane geometry
614
+ _key_new[0::6] = self.to_k_xy_lora_geo(encoder_hidden_states[0::6])
615
+ # lora for xy plane texture
616
+ _key_new[3::6] = self.to_k_xy_lora_tex(encoder_hidden_states[3::6])
617
+ # lora for xz plane geometry
618
+ _key_new[1::6] = self.to_k_xz_lora_geo(encoder_hidden_states[1::6])
619
+ # lora for xz plane texture
620
+ _key_new[4::6] = self.to_k_xz_lora_tex(encoder_hidden_states[4::6])
621
+ # lora for yz plane geometry
622
+ _key_new[2::6] = self.to_k_yz_lora_geo(encoder_hidden_states[2::6])
623
+ # lora for yz plane texture
624
+ _key_new[5::6] = self.to_k_yz_lora_tex(encoder_hidden_states[5::6])
625
+ key = key + scale * _key_new
626
+
627
+ value = attn.to_v(encoder_hidden_states)
628
+ _value_new = torch.zeros_like(value)
629
+ # lora for xy plane geometry
630
+ _value_new[0::6] = self.to_v_xy_lora_geo(encoder_hidden_states[0::6])
631
+ # lora for xy plane texture
632
+ _value_new[3::6] = self.to_v_xy_lora_tex(encoder_hidden_states[3::6])
633
+ # lora for xz plane geometry
634
+ _value_new[1::6] = self.to_v_xz_lora_geo(encoder_hidden_states[1::6])
635
+ # lora for xz plane texture
636
+ _value_new[4::6] = self.to_v_xz_lora_tex(encoder_hidden_states[4::6])
637
+ # lora for yz plane geometry
638
+ _value_new[2::6] = self.to_v_yz_lora_geo(encoder_hidden_states[2::6])
639
+ # lora for yz plane texture
640
+ _value_new[5::6] = self.to_v_yz_lora_tex(encoder_hidden_states[5::6])
641
+ value = value + scale * _value_new
642
+
643
+ elif self.lora_type in ["vanilla",]:
644
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
645
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
646
+
647
+ elif self.lora_type in ["none",]:
648
+ key = attn.to_k(encoder_hidden_states)
649
+ value = attn.to_v(encoder_hidden_states)
650
+
651
+ key = attn.head_to_batch_dim(key)
652
+ value = attn.head_to_batch_dim(value)
653
+ ############################################################################################################
654
+
655
+ # calculate the attention scores
656
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
657
+ hidden_states = torch.bmm(attention_probs, value)
658
+ hidden_states = attn.batch_to_head_dim(hidden_states)
659
+
660
+
661
+ ############################################################################################################
662
+ # linear proj
663
+ if self.lora_type in ["hexa_v1", ]:
664
+ hidden_states = attn.to_out[0](hidden_states)
665
+ _hidden_states_new = torch.zeros_like(hidden_states)
666
+ # lora for xy plane geometry
667
+ _hidden_states_new[0::6] = self.to_out_xy_lora_geo(hidden_states[0::6])
668
+ # lora for xy plane texture
669
+ _hidden_states_new[3::6] = self.to_out_xy_lora_tex(hidden_states[3::6])
670
+ # lora for xz plane geometry
671
+ _hidden_states_new[1::6] = self.to_out_xz_lora_geo(hidden_states[1::6])
672
+ # lora for xz plane texture
673
+ _hidden_states_new[4::6] = self.to_out_xz_lora_tex(hidden_states[4::6])
674
+ # lora for yz plane geometry
675
+ _hidden_states_new[2::6] = self.to_out_yz_lora_geo(hidden_states[2::6])
676
+ # lora for yz plane texture
677
+ _hidden_states_new[5::6] = self.to_out_yz_lora_tex(hidden_states[5::6])
678
+ hidden_states = hidden_states + scale * _hidden_states_new
679
+ elif self.lora_type in ["vanilla",]:
680
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
681
+ elif self.lora_type in ["none",]:
682
+ hidden_states = attn.to_out[0](hidden_states)
683
+ else:
684
+ raise NotImplementedError("The LoRA type is not supported for the to_out layer in HplaneCrossAttentionLoRAAttnProcessor.")
685
+
686
+ # dropout
687
+ hidden_states = attn.to_out[1](hidden_states)
688
+ ############################################################################################################
689
+
690
+ if input_ndim == 4:
691
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
692
+
693
+ if attn.residual_connection:
694
+ hidden_states = hidden_states + residual
695
+
696
+ hidden_states = hidden_states / attn.rescale_output_factor
697
+
698
+ return hidden_states
699
+
700
+ @dataclass
701
+ class GeneratorConfig:
702
+ training_type: str = "self_lora_rank_16-cross_lora_rank_16-locon_rank_16"
703
+ output_dim: int = 32
704
+ self_lora_type: str = "hexa_v1"
705
+ cross_lora_type: str = "hexa_v1"
706
+ locon_type: str = "vanilla_v1"
707
+ vae_attn_type: str = "basic"
708
+ prompt_bias: bool = False
709
+
710
+ class OneStepTriplaneDualStableDiffusion(nn.Module):
711
+ """
712
+ One-step Triplane Stable Diffusion module.
713
+ """
714
+ def __init__(
715
+ self,
716
+ config: Union[dict, GeneratorConfig],
717
+ vae: AutoencoderKL,
718
+ unet: UNet2DConditionModel,
719
+ ):
720
+ super().__init__()
721
+ # Convert dict to GeneratorConfig if needed
722
+ self.cfg = GeneratorConfig(**config) if isinstance(config, dict) else config
723
+ self.output_dim = self.cfg.output_dim
724
+
725
+ # Load models
726
+ self.unet = unet
727
+ self.vae = vae
728
+
729
+ # Get device from one of the models
730
+ self.device = next(self.unet.parameters()).device
731
+
732
+ # Remove unused components
733
+ del vae.encoder
734
+ del vae.quant_conv
735
+
736
+ # Get training type from config
737
+ training_type = self.cfg.training_type
738
+
739
+ # save trainable parameters
740
+ if not "full" in training_type: # then parameter-efficient training
741
+
742
+ trainable_params = {}
743
+
744
+ assert "lora" in training_type or "locon" in training_type, "The training type is not supported."
745
+ @dataclass
746
+ class SubModules:
747
+ unet: UNet2DConditionModel
748
+ vae: AutoencoderKL
749
+
750
+ self.submodules = SubModules(
751
+ unet=unet.to(self.device),
752
+ vae=vae.to(self.device),
753
+ )
754
+
755
+ # freeze all the parameters
756
+ for param in self.unet.parameters():
757
+ param.requires_grad_(False)
758
+ for param in self.vae.parameters():
759
+ param.requires_grad_(False)
760
+
761
+ ############################################################
762
+ # overwrite the unet and vae with the customized processors
763
+
764
+ if "lora" in training_type:
765
+
766
+ # parse the rank from the training type, with the template "lora_rank_{}"
767
+ assert "self_lora_rank" in training_type, "The self_lora_rank is not specified."
768
+ rank = re.search(r"self_lora_rank_(\d+)", training_type).group(1)
769
+ self.self_lora_rank = int(rank)
770
+
771
+ assert "cross_lora_rank" in training_type, "The cross_lora_rank is not specified."
772
+ rank = re.search(r"cross_lora_rank_(\d+)", training_type).group(1)
773
+ self.cross_lora_rank = int(rank)
774
+
775
+ # if the finetuning is with bias
776
+ self.w_lora_bias = False
777
+ if "with_bias" in training_type:
778
+ self.w_lora_bias = True
779
+
780
+ # specify the attn_processor for unet
781
+ lora_attn_procs = self._set_attn_processor(
782
+ self.unet,
783
+ self_attn_name="attn1.processor",
784
+ self_lora_type=self.cfg.self_lora_type,
785
+ cross_lora_type=self.cfg.cross_lora_type
786
+ )
787
+ self.unet.set_attn_processor(lora_attn_procs)
788
+ # update the trainable parameters
789
+ trainable_params.update(self.unet.attn_processors)
790
+
791
+ # specify the attn_processor for vae
792
+ lora_attn_procs = self._set_attn_processor(
793
+ self.vae,
794
+ self_attn_name="processor",
795
+ self_lora_type=self.cfg.vae_attn_type, # hard-coded for vae
796
+ cross_lora_type="vanilla"
797
+ )
798
+ self.vae.set_attn_processor(lora_attn_procs)
799
+ # update the trainable parameters
800
+ trainable_params.update(self.vae.attn_processors)
801
+ else:
802
+ raise NotImplementedError("The training type is not supported.")
803
+
804
+ if "locon" in training_type:
805
+ # parse the rank from the training type, with the template "locon_rank_{}"
806
+ rank = re.search(r"locon_rank_(\d+)", training_type).group(1)
807
+ self.locon_rank = int(rank)
808
+
809
+ # if the finetuning is with bias
810
+ self.w_locon_bias = False
811
+ if "with_bias" in training_type:
812
+ self.w_locon_bias = True
813
+
814
+ # specify the conv_processor for unet
815
+ locon_procs = self._set_conv_processor(
816
+ self.unet,
817
+ locon_type=self.cfg.locon_type
818
+ )
819
+
820
+ # update the trainable parameters
821
+ trainable_params.update(locon_procs)
822
+
823
+ # specify the conv_processor for vae
824
+ locon_procs = self._set_conv_processor(
825
+ self.vae,
826
+ locon_type="vanilla_v1", # hard-coded for vae decoder
827
+ )
828
+ # update the trainable parameters
829
+ trainable_params.update(locon_procs)
830
+ else:
831
+ raise NotImplementedError("The training type is not supported.")
832
+
833
+ # overwrite the outconv
834
+ # conv_out_orig = self.vae.decoder.conv_out
835
+ conv_out_new = nn.Conv2d(
836
+ in_channels=128, # conv_out_orig.in_channels, hard-coded
837
+ out_channels=self.cfg.output_dim, kernel_size=3, padding=1
838
+ )
839
+
840
+ # update the trainable parameters
841
+ self.vae.decoder.conv_out = conv_out_new
842
+ trainable_params["vae.decoder.conv_out"] = conv_out_new
843
+
844
+ # save the trainable parameters
845
+ self.peft_layers = AttnProcsLayers(trainable_params).to(self.device)
846
+ self.peft_layers._load_state_dict_pre_hooks.clear()
847
+ self.peft_layers._state_dict_hooks.clear()
848
+
849
+ # hard-coded for now
850
+ self.num_planes = 6
851
+
852
+ if self.cfg.prompt_bias:
853
+ self.prompt_bias = nn.Parameter(torch.zeros(self.num_planes, 77, 1024))
854
+
855
+ @property
856
+ def unet(self):
857
+ return self.submodules.unet
858
+
859
+ @property
860
+ def vae(self):
861
+ return self.submodules.vae
862
+
863
+ def _set_conv_processor(
864
+ self,
865
+ module,
866
+ conv_name: str = "LoRACompatibleConv",
867
+ locon_type: str = "vanilla_v1",
868
+ ):
869
+ locon_procs = {}
870
+ for _name, _module in module.named_modules():
871
+ if _module.__class__.__name__ == conv_name:
872
+ # append the locon processor to the module
873
+ locon_proc = TriplaneLoRAConv2dLayer(
874
+ in_features=_module.in_channels,
875
+ out_features=_module.out_channels,
876
+ rank=self.locon_rank,
877
+ kernel_size=_module.kernel_size,
878
+ stride=_module.stride,
879
+ padding=_module.padding,
880
+ with_bias = self.w_locon_bias,
881
+ locon_type= locon_type,
882
+ )
883
+ # add the locon processor to the module
884
+ _module.lora_layer = locon_proc
885
+ # update the trainable parameters
886
+ key_name = f"{_name}.lora_layer"
887
+ locon_procs[key_name] = locon_proc
888
+ return locon_procs
889
+
890
+
891
+
892
+ def _set_attn_processor(
893
+ self,
894
+ module,
895
+ self_attn_name: str = "attn1.processor",
896
+ self_attn_procs = TriplaneSelfAttentionLoRAAttnProcessor,
897
+ self_lora_type: str = "hexa_v1",
898
+ cross_attn_procs = TriplaneCrossAttentionLoRAAttnProcessor,
899
+ cross_lora_type: str = "hexa_v1",
900
+ ):
901
+ lora_attn_procs = {}
902
+ for name in module.attn_processors.keys():
903
+
904
+ if name.startswith("mid_block"):
905
+ hidden_size = module.config.block_out_channels[-1]
906
+ elif name.startswith("up_blocks"):
907
+ block_id = int(name[len("up_blocks.")])
908
+ hidden_size = list(reversed(module.config.block_out_channels))[
909
+ block_id
910
+ ]
911
+ elif name.startswith("down_blocks"):
912
+ block_id = int(name[len("down_blocks.")])
913
+ hidden_size = module.config.block_out_channels[block_id]
914
+ elif name.startswith("decoder"):
915
+ # special case for decoder in SD
916
+ hidden_size = 512
917
+
918
+ if name.endswith(self_attn_name):
919
+ # it is self-attention
920
+ cross_attention_dim = None
921
+ lora_attn_procs[name] = self_attn_procs(
922
+ hidden_size, self.self_lora_rank, with_bias = self.w_lora_bias,
923
+ lora_type = self_lora_type
924
+ )
925
+ else:
926
+ # it is cross-attention
927
+ cross_attention_dim = module.config.cross_attention_dim
928
+ lora_attn_procs[name] = cross_attn_procs(
929
+ hidden_size, cross_attention_dim, self.cross_lora_rank, with_bias = self.w_lora_bias,
930
+ lora_type = cross_lora_type
931
+ )
932
+ return lora_attn_procs
933
+
934
+ def forward(
935
+ self,
936
+ text_embed,
937
+ styles,
938
+ ):
939
+ return None
940
+ def forward_denoise(
941
+ self,
942
+ text_embed,
943
+ noisy_input,
944
+ t,
945
+ ):
946
+
947
+ batch_size = text_embed.size(0)
948
+ noise_shape = noisy_input.size(-2)
949
+
950
+ if text_embed.ndim == 3:
951
+ # same text_embed for all planes
952
+ # text_embed = text_embed.repeat(self.num_planes, 1, 1) # wrong!!!
953
+ text_embed = text_embed.repeat_interleave(self.num_planes, dim=0)
954
+ elif text_embed.ndim == 4:
955
+ # different text_embed for each plane
956
+ text_embed = text_embed.view(batch_size * self.num_planes, *text_embed.shape[-2:])
957
+ else:
958
+ raise ValueError("The text_embed should be either 3D or 4D.")
959
+
960
+ if hasattr(self, "prompt_bias"):
961
+ text_embed = text_embed + self.prompt_bias.repeat(batch_size, 1, 1) * self.cfg.prompt_bias_lr_multiplier
962
+
963
+ noisy_input = noisy_input.view(-1, 4, noise_shape, noise_shape)
964
+ noise_pred = self.unet(
965
+ noisy_input,
966
+ t,
967
+ encoder_hidden_states=text_embed
968
+ ).sample
969
+
970
+
971
+ return noise_pred
972
+
973
+ def forward_decode(
974
+ self,
975
+ latents,
976
+ ):
977
+ latents = latents.view(-1, 4, *latents.shape[-2:])
978
+ triplane = self.vae.decode(latents).sample
979
+ triplane = triplane.view(-1, self.num_planes, self.cfg.output_dim, *triplane.shape[-2:])
980
+
981
+ return triplane
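
To make the tensor bookkeeping of forward_denoise and forward_decode explicit, here is a shape walk-through with illustrative sizes (a batch of 2 prompts, 6 planes, 32x32 SD latents; the 8x VAE upsampling factor is the standard Stable Diffusion value):

# Shape sketch only; the numbers are illustrative.
import torch

B, num_planes, output_dim = 2, 6, 32
text_embed = torch.randn(B, 77, 1024)                 # one prompt embedding per sample (3D case)
noisy_input = torch.randn(B, num_planes, 4, 32, 32)   # one SD latent per plane

# forward_denoise: text_embed -> (B*6, 77, 1024) via repeat_interleave,
#                  noisy_input -> (B*6, 4, 32, 32); the UNet returns noise_pred of the same shape.
# forward_decode:  latents (B*6, 4, 32, 32) -> VAE decoder -> (B*6, output_dim, 256, 256)
#                  -> regrouped as triplane (B, 6, output_dim, 256, 256).
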
triplaneturbo_executable/models/geometry/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .sd_dual_triplanes import StableDiffusionTriplaneDualAttention, StableDiffusionTriplaneDualAttentionConfig
triplaneturbo_executable/models/geometry/sd_dual_triplanes.py ADDED
@@ -0,0 +1,394 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from jaxtyping import Float
10
+ from torch import Tensor
11
+ from typing import *
12
+
13
+ from ...utils.general_utils import contract_to_unisphere_custom, sample_from_planes
14
+ from diffusers import StableDiffusionPipeline, AutoencoderKL, UNet2DConditionModel
15
+
16
+ from ..networks import get_mlp
17
+ from ...utils.general_utils import config_to_primitive
18
+ @dataclass
19
+ class StableDiffusionTriplaneDualAttentionConfig:
20
+ n_feature_dims: int = 3
21
+ space_generator_config: dict = field(
22
+ default_factory=lambda: {
23
+ "pretrained_model_name_or_path": "stable-diffusion-2-1-base",
24
+ "training_type": "self_lora_rank_16-cross_lora_rank_16-locon_rank_16",
25
+ "output_dim": 32,
26
+ "gradient_checkpoint": False,
27
+ "self_lora_type": "hexa_v1",
28
+ "cross_lora_type": "hexa_v1",
29
+ "locon_type": "vanilla_v1",
30
+
31
+ }
32
+ )
33
+
34
+ mlp_network_config: dict = field(
35
+ default_factory=lambda: {
36
+ "otype": "VanillaMLP",
37
+ "activation": "ReLU",
38
+ "output_activation": "none",
39
+ "n_neurons": 64,
40
+ "n_hidden_layers": 2,
41
+ }
42
+ )
43
+
44
+ backbone: str = "one_step_triplane_dual_stable_diffusion"
45
+ finite_difference_normal_eps: Union[
46
+ float, str
47
+ ] = 0.01 # in [float, "progressive"]
48
+ sdf_bias: Union[float, str] = 0.0
49
+ sdf_bias_params: Optional[Any] = None
50
+
51
+ isosurface_remove_outliers: bool = False
52
+ # rotate planes to match the conventional orientation of images generated by SD
53
+ # in right-handed coordinate system
54
+ # the xy plane should look like an image seen from a top-down / bottom-up view
55
+ # the xz plane should look like an image seen from a right-left / left-right view
56
+ # the yz plane should look like an image seen from a front-back / back-front view
57
+ rotate_planes: Optional[str] = None
58
+ split_channels: Optional[str] = None
59
+
60
+ geo_interpolate: str = "v1"
61
+ tex_interpolate: str = "v1"
62
+
63
+ isosurface_deformable_grid: bool = True
64
+
65
+
66
+ class StableDiffusionTriplaneDualAttention(nn.Module):
67
+ def __init__(
68
+ self,
69
+ config: StableDiffusionTriplaneDualAttentionConfig,
70
+ vae: AutoencoderKL,
71
+ unet: UNet2DConditionModel,
72
+ ):
73
+ super().__init__()
74
+
75
+ self.cfg = StableDiffusionTriplaneDualAttentionConfig(**config) if isinstance(config, dict) else config
76
+
77
+ # set up the space generator
78
+ from ...extern.sd_dual_triplane_modules import OneStepTriplaneDualStableDiffusion as Generator
79
+ self.space_generator = Generator(
80
+ self.cfg.space_generator_config,
81
+ vae=vae,
82
+ unet=unet,
83
+ )
84
+
85
+ input_dim = self.space_generator.output_dim # feat_xy + feat_xz + feat_yz
86
+ assert self.cfg.split_channels in [None, "v1"]
87
+ if self.cfg.split_channels in ["v1"]: # split geometry and texture
88
+ input_dim = input_dim // 2
89
+
90
+ assert self.cfg.geo_interpolate in ["v1", "v2"]
91
+ if self.cfg.geo_interpolate in ["v2"]:
92
+ geo_input_dim = input_dim * 3 # concat[feat_xy, feat_xz, feat_yz]
93
+ else:
94
+ geo_input_dim = input_dim # feat_xy + feat_xz + feat_yz
95
+
96
+ assert self.cfg.tex_interpolate in ["v1", "v2"]
97
+ if self.cfg.tex_interpolate in ["v2"]:
98
+ tex_input_dim = input_dim * 3 # concat[feat_xy, feat_xz, feat_yz]
99
+ else:
100
+ tex_input_dim = input_dim # feat_xy + feat_xz + feat_yz
101
+
102
+ self.sdf_network = get_mlp(
103
+ geo_input_dim,
104
+ 1,
105
+ self.cfg.mlp_network_config,
106
+ )
107
+ if self.cfg.n_feature_dims > 0:
108
+
109
+ self.feature_network = get_mlp(
110
+ tex_input_dim,
111
+ self.cfg.n_feature_dims,
112
+ self.cfg.mlp_network_config,
113
+ )
114
+
115
+ if self.cfg.isosurface_deformable_grid:
116
+ self.deformation_network = get_mlp(
117
+ geo_input_dim,
118
+ 3,
119
+ self.cfg.mlp_network_config,
120
+ )
121
+
122
+ # hard-coded for now
123
+ self.unbounded = False
124
+ radius = 1.0
125
+
126
+ self.register_buffer(
127
+ "bbox",
128
+ torch.as_tensor(
129
+ [
130
+ [-radius, -radius, -radius],
131
+ [radius, radius, radius],
132
+ ],
133
+ dtype=torch.float32,
134
+ )
135
+ )
136
+
137
+ def initialize_shape(self) -> None:
138
+ # not used
139
+ pass
140
+
141
+ def get_shifted_sdf(
142
+ self,
143
+ points: Float[Tensor, "*N Di"],
144
+ sdf: Float[Tensor, "*N 1"]
145
+ ) -> Float[Tensor, "*N 1"]:
146
+ sdf_bias: Union[float, Float[Tensor, "*N 1"]]
147
+ if self.cfg.sdf_bias == "ellipsoid":
148
+ assert (
149
+ isinstance(self.cfg.sdf_bias_params, Sized)
150
+ and len(self.cfg.sdf_bias_params) == 3
151
+ )
152
+ size = torch.as_tensor(self.cfg.sdf_bias_params).to(points)
153
+ sdf_bias = ((points / size) ** 2).sum(
154
+ dim=-1, keepdim=True
155
+ ).sqrt() - 1.0 # pseudo signed distance of an ellipsoid
156
+ elif self.cfg.sdf_bias == "sphere":
157
+ assert isinstance(self.cfg.sdf_bias_params, float)
158
+ radius = self.cfg.sdf_bias_params
159
+ sdf_bias = (points**2).sum(dim=-1, keepdim=True).sqrt() - radius
160
+ elif isinstance(self.cfg.sdf_bias, float):
161
+ sdf_bias = self.cfg.sdf_bias
162
+ else:
163
+ raise ValueError(f"Unknown sdf bias {self.cfg.sdf_bias}")
164
+ return sdf + sdf_bias
165
+
166
+ def generate_space_cache(
167
+ self,
168
+ styles: Float[Tensor, "B Z"],
169
+ text_embed: Float[Tensor, "B C"],
170
+ ) -> Any:
171
+ output = self.space_generator(
172
+ text_embed = text_embed,
173
+ styles = styles,
174
+ )
175
+ return output
176
+
177
+ def denoise(
178
+ self,
179
+ noisy_input: Any,
180
+ text_embed: Float[Tensor, "B C"],
181
+ timestep
182
+ ) -> Any:
183
+ output = self.space_generator.forward_denoise(
184
+ text_embed = text_embed,
185
+ noisy_input = noisy_input,
186
+ t = timestep
187
+ )
188
+ return output
189
+
190
+ def decode(
191
+ self,
192
+ latents: Any,
193
+ ) -> Any:
194
+ triplane = self.space_generator.forward_decode(
195
+ latents = latents
196
+ )
197
+ if self.cfg.split_channels is None:
198
+ return triplane
199
+ elif self.cfg.split_channels == "v1":
200
+ B, _, C, H, W = triplane.shape
201
+ # the geometry triplane uses the first output_dim // 2 channels
202
+ # the texture triplane uses the last output_dim // 2 channels
203
+ used_indices_geo = torch.tensor([True] * (self.space_generator.output_dim// 2) + [False] * (self.space_generator.output_dim // 2))
204
+ used_indices_tex = torch.tensor([False] * (self.space_generator.output_dim // 2) + [True] * (self.space_generator.output_dim // 2))
205
+ used_indices = torch.stack([used_indices_geo] * 3 + [used_indices_tex] * 3, dim=0).to(triplane.device)
206
+ return triplane[:, used_indices].view(B, 6, C//2, H, W)
207
+
208
+ def interpolate_encodings(
209
+ self,
210
+ points: Float[Tensor, "*N Di"],
211
+ space_cache: Float[Tensor, "B 3 C//3 H W"],
212
+ only_geo: bool = False,
213
+ ):
214
+ batch_size, n_points, n_dims = points.shape
215
+ # the following code is similar to EG3D / OpenLRM
216
+
217
+ assert self.cfg.rotate_planes in [None, "v1", "v2"]
218
+
219
+ if self.cfg.rotate_planes is None:
220
+ raise NotImplementedError("rotate_planes == None is not implemented yet.")
221
+
222
+ space_cache_rotated = torch.zeros_like(space_cache)
223
+ if self.cfg.rotate_planes == "v1":
224
+ # xy plane, diagonal-wise
225
+ space_cache_rotated[:, 0::3] = torch.transpose(
226
+ space_cache[:, 0::3], 3, 4
227
+ )
228
+ # xz plane, rotate 180° counterclockwise
229
+ space_cache_rotated[:, 1::3] = torch.rot90(
230
+ space_cache[:, 1::3], k=2, dims=(3, 4)
231
+ )
232
+ # zy plane, rotate 90° clockwise
233
+ space_cache_rotated[:, 2::3] = torch.rot90(
234
+ space_cache[:, 2::3], k=-1, dims=(3, 4)
235
+ )
236
+ elif self.cfg.rotate_planes == "v2":
237
+ # all are the same as v1, except for the xy plane
238
+ # xy plane, row-wise flip
239
+ space_cache_rotated[:, 0::3] = torch.flip(
240
+ space_cache[:, 0::3], dims=(4,)
241
+ )
242
+ # xz plane, rotate 180° counterclockwise
243
+ space_cache_rotated[:, 1::3] = torch.rot90(
244
+ space_cache[:, 1::3], k=2, dims=(3, 4)
245
+ )
246
+ # zy plane, rotate 90° clockwise
247
+ space_cache_rotated[:, 2::3] = torch.rot90(
248
+ space_cache[:, 2::3], k=-1, dims=(3, 4)
249
+ )
250
+
251
+
252
+ # the 0, 1, 2 axis of the space_cache_rotated is for geometry
253
+ geo_feat = sample_from_planes(
254
+ plane_features = space_cache_rotated[:, 0:3].contiguous(),
255
+ coordinates = points,
256
+ interpolate_feat = self.cfg.geo_interpolate
257
+ ).view(*points.shape[:-1],-1)
258
+
259
+ if only_geo:
260
+ return geo_feat
261
+ else:
262
+ # the 3, 4, 5 axis of the space_cache is for texture
263
+ tex_feat = sample_from_planes(
264
+ plane_features = space_cache_rotated[:, 3:6].contiguous(),
265
+ coordinates = points,
266
+ interpolate_feat = self.cfg.tex_interpolate
267
+ ).view(*points.shape[:-1],-1)
268
+
269
+ return geo_feat, tex_feat
270
+
271
+
272
+ def rescale_points(
273
+ self,
274
+ points: Float[Tensor, "*N Di"],
275
+ ):
276
+ # transform points from original space to [-1, 1]^3
277
+ points = contract_to_unisphere_custom(
278
+ points,
279
+ self.bbox,
280
+ self.unbounded
281
+ )
282
+ return points
283
+
284
+ def forward(
285
+ self,
286
+ points: Float[Tensor, "*N Di"],
287
+ space_cache: Any,
288
+ ) -> Dict[str, Float[Tensor, "..."]]:
289
+ batch_size, n_points, n_dims = points.shape
290
+
291
+ points_unscaled = points
292
+ points = self.rescale_points(points)
293
+
294
+ enc_geo, enc_tex = self.interpolate_encodings(points, space_cache)
295
+ sdf_orig = self.sdf_network(enc_geo).view(*points.shape[:-1], 1)
296
+ sdf = self.get_shifted_sdf(points_unscaled, sdf_orig)
297
+ output = {
298
+ "sdf": sdf.view(batch_size * n_points, 1), # reshape to [B*N, 1]
299
+ }
300
+ if self.cfg.n_feature_dims > 0:
301
+ features = self.feature_network(enc_tex).view(
302
+ *points.shape[:-1], self.cfg.n_feature_dims)
303
+ output.update(
304
+ {
305
+ "features": features.view(batch_size * n_points, self.cfg.n_feature_dims)
306
+ }
307
+ )
308
+ return output
309
+
310
+ def forward_sdf(
311
+ self,
312
+ points: Float[Tensor, "*N Di"],
313
+ space_cache: Float[Tensor, "B 3 C//3 H W"],
314
+ ) -> Float[Tensor, "*N 1"]:
315
+ batch_size = points.shape[0]
316
+ assert space_cache.shape[0] == batch_size, "points and space_cache should have the same batch size in forward_sdf"
317
+ points_unscaled = points
318
+
319
+ points = self.rescale_points(points)
320
+
321
+ # sample from planes
322
+ enc_geo = self.interpolate_encodings(
323
+ points.reshape(batch_size, -1, 3),
324
+ space_cache,
325
+ only_geo = True
326
+ ).reshape(*points.shape[:-1], -1)
327
+ sdf = self.sdf_network(enc_geo).reshape(*points.shape[:-1], 1)
328
+
329
+ sdf = self.get_shifted_sdf(points_unscaled, sdf)
330
+ return sdf
331
+
332
+ def forward_field(
333
+ self,
334
+ points: Float[Tensor, "*N Di"],
335
+ space_cache: Float[Tensor, "B 3 C//3 H W"],
336
+ ) -> Tuple[Float[Tensor, "*N 1"], Optional[Float[Tensor, "*N 3"]]]:
337
+ batch_size = points.shape[0]
338
+ assert space_cache.shape[0] == batch_size, "points and space_cache should have the same batch size in forward_field"
339
+ points_unscaled = points
340
+
341
+ points = self.rescale_points(points)
342
+
343
+ # sample from planes
344
+ enc_geo = self.interpolate_encodings(points, space_cache, only_geo = True)
345
+ sdf = self.sdf_network(enc_geo).reshape(*points.shape[:-1], 1)
346
+ sdf = self.get_shifted_sdf(points_unscaled, sdf)
347
+ deformation: Optional[Float[Tensor, "*N 3"]] = None
348
+ if self.cfg.isosurface_deformable_grid:
349
+ deformation = self.deformation_network(enc_geo).reshape(*points.shape[:-1], 3)
350
+ return sdf, deformation
351
+
352
+ def forward_level(
353
+ self, field: Float[Tensor, "*N 1"], threshold: float
354
+ ) -> Float[Tensor, "*N 1"]:
355
+ # TODO: is this function correct?
356
+ return field - threshold
357
+
358
+ def export(
359
+ self,
360
+ points: Float[Tensor, "*N Di"],
361
+ space_cache: Float[Tensor, "B 3 C//3 H W"],
362
+ **kwargs) -> Dict[str, Any]:
363
+
364
+ # TODO: is this function correct?
365
+ out: Dict[str, Any] = {}
366
+ if self.cfg.n_feature_dims == 0:
367
+ return out
368
+
369
+ orig_shape = points.shape
370
+ points = points.view(1, -1, 3)
371
+
372
+ # assume the batch size is 1
373
+ points_unscaled = points
374
+ points = self.rescale_points(points)
375
+
376
+ # sample from planes
377
+ _, enc_tex = self.interpolate_encodings(points, space_cache)
378
+ features = self.feature_network(enc_tex).view(
379
+ *points.shape[:-1], self.cfg.n_feature_dims
380
+ )
381
+ out.update(
382
+ {
383
+ "features": features.view(orig_shape[:-1] + (self.cfg.n_feature_dims,))
384
+ }
385
+ )
386
+ return out
387
+
388
+ def train(self, mode=True):
389
+ super().train(mode)
390
+ self.space_generator.train(mode)
391
+
392
+ def eval(self):
393
+ super().eval()
394
+ self.space_generator.eval()
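
A hedged usage sketch of the geometry module defined above: decode one-step latents into the split geometry/texture plane cache, then query SDF and raw color features at 3D points. Shapes are illustrative, and `geometry` is assumed to be an already constructed StableDiffusionTriplaneDualAttention with split_channels="v1".

# Sketch (not part of the repo files): querying the dual-triplane geometry.
import torch

latents = torch.randn(1 * 6, 4, 32, 32)        # one-step generator output, B=1, 6 planes
space_cache = geometry.decode(latents)         # (B, 6, output_dim // 2, H, W) for split_channels == "v1"

points = torch.rand(1, 4096, 3) * 2 - 1        # query points inside the [-1, 1]^3 bounding box
out = geometry.forward(points, space_cache)
sdf = out["sdf"]                               # (B*N, 1) signed distances with the sphere bias applied
features = out["features"]                     # (B*N, 3) raw color features, activated later by the material
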
triplaneturbo_executable/models/networks.py ADDED
@@ -0,0 +1,83 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from ..utils.general_utils import config_to_primitive
5
+ from dataclasses import dataclass
6
+ from typing import Optional, Literal
7
+
8
+ def get_activation(name):
9
+ if name is None:
10
+ return lambda x: x
11
+ name = name.lower()
12
+ if name == "none":
13
+ return lambda x: x
14
+ elif name == "sigmoid-mipnerf":
15
+ return lambda x: torch.sigmoid(x) * (1 + 2*0.001) - 0.001 # Uses sigmoid clamping from MipNeRF
16
+ else:
17
+ try:
18
+ return getattr(F, name)
19
+ except AttributeError:
20
+ raise ValueError(f"Unknown activation function: {name}")
21
+
22
+
23
+ class VanillaMLP(nn.Module):
24
+ def __init__(self, dim_in: int, dim_out: int, config: dict):
25
+ super().__init__()
26
+ # Convert dict to MLPConfig if needed
27
+ if isinstance(config, dict):
28
+ config = MLPConfig(**config)
29
+
30
+ self.n_neurons = config.n_neurons
31
+ self.n_hidden_layers = config.n_hidden_layers
32
+
33
+ layers = [
34
+ self.make_linear(dim_in, self.n_neurons, is_first=True, is_last=False),
35
+ self.make_activation(),
36
+ ]
37
+ for i in range(self.n_hidden_layers - 1):
38
+ layers += [
39
+ self.make_linear(
40
+ self.n_neurons, self.n_neurons, is_first=False, is_last=False
41
+ ),
42
+ self.make_activation(),
43
+ ]
44
+ layers += [
45
+ self.make_linear(self.n_neurons, dim_out, is_first=False, is_last=True)
46
+ ]
47
+ self.layers = nn.Sequential(*layers)
48
+ self.output_activation = get_activation(config.output_activation)
49
+
50
+ def forward(self, x):
51
+ # disable autocast
52
+ # strange that the parameters will have empty gradients if autocast is enabled in AMP
53
+ with torch.cuda.amp.autocast(enabled=False):
54
+ x = self.layers(x)
55
+ x = self.output_activation(x)
56
+ return x
57
+
58
+ def make_linear(self, dim_in, dim_out, is_first, is_last):
59
+ layer = nn.Linear(dim_in, dim_out, bias=False)
60
+ return layer
61
+
62
+ def make_activation(self):
63
+ return nn.ReLU(inplace=True)
64
+
65
+ @dataclass
66
+ class MLPConfig:
67
+ otype: str = "VanillaMLP"
68
+ activation: str = "ReLU"
69
+ output_activation: str = "none"
70
+ n_neurons: int = 64
71
+ n_hidden_layers: int = 2
72
+
73
+ def get_mlp(input_dim: int, output_dim: int, config: dict) -> nn.Module:
74
+ """Create MLP network based on config"""
75
+ # Convert dict to MLPConfig
76
+ if isinstance(config, dict):
77
+ config = MLPConfig(**config)
78
+
79
+ if config.otype == "VanillaMLP":
80
+ network = VanillaMLP(input_dim, output_dim, config)
81
+ else:
82
+ raise ValueError(f"Unknown MLP type: {config.otype}")
83
+ return network
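
A short usage sketch for get_mlp, using the same config dict that appears in the geometry defaults above (input size and point count are illustrative):

# Sketch: build and run one of the small decoder heads.
import torch

mlp_config = {
    "otype": "VanillaMLP",
    "activation": "ReLU",
    "output_activation": "none",
    "n_neurons": 64,
    "n_hidden_layers": 2,
}
sdf_head = get_mlp(input_dim=32, output_dim=1, config=mlp_config)
features = torch.randn(4096, 32)               # per-point plane features
sdf = sdf_head(features)                       # (4096, 1)
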
triplaneturbo_executable/pipelines/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .triplaneturbo_text_to_3d import (
2
+ TriplaneTurboTextTo3DPipeline,
3
+ TriplaneTurboTextTo3DPipelineConfig
4
+ )
5
+
6
+ __all__ = [
7
+ "TriplaneTurboTextTo3DPipeline",
8
+ "TriplaneTurboTextTo3DPipelineConfig"
9
+ ]
triplaneturbo_executable/pipelines/base.py ADDED
@@ -0,0 +1,33 @@
1
+ from typing import *
2
+ import torch
3
+ import torch.nn as nn
4
+ from diffusers import DiffusionPipeline
5
+
6
+ class Pipeline(DiffusionPipeline):
7
+ """Base class for all pipelines."""
8
+
9
+ def __init__(self):
10
+ super().__init__()
11
+
12
+ def __call__(self, *args, **kwargs):
13
+ raise NotImplementedError
14
+
15
+ def enable_xformers_memory_efficient_attention(self):
16
+ pass
17
+
18
+ def enable_model_cpu_offload(self):
19
+ pass
20
+
21
+ @property
22
+ def device(self) -> torch.device:
23
+ for model in self.models.values():
24
+ if hasattr(model, 'device'):
25
+ return model.device
26
+ for model in self.models.values():
27
+ if hasattr(model, 'parameters'):
28
+ return next(model.parameters()).device
29
+ raise RuntimeError("No device found.")
30
+
31
+ def to(self, device: torch.device) -> None:
32
+ for model in self.models.values():
33
+ model.to(device)
triplaneturbo_executable/pipelines/triplaneturbo_text_to_3d.py ADDED
@@ -0,0 +1,344 @@
1
+ import os
2
+ import re
3
+ import json
4
+ from tqdm import tqdm
5
+
6
+ import torch
7
+ from typing import *
8
+ from dataclasses import dataclass, field
9
+ from diffusers import StableDiffusionPipeline
10
+
11
+ from .base import Pipeline
12
+ from ..models.geometry import StableDiffusionTriplaneDualAttention
13
+ from ..utils.mesh_exporter import isosurface, colorize_mesh, DiffMarchingCubeHelper
14
+
15
+ from diffusers.loaders import AttnProcsLayers
16
+ from ..models.networks import get_activation
17
+
18
+ @dataclass
19
+ class TriplaneTurboTextTo3DPipelineConfig:
20
+ """Configuration for TriplaneTurboTextTo3DPipeline"""
21
+ # Basic pipeline settings
22
+ base_model_name_or_path: str = "pretrained/stable-diffusion-2-1-base"
23
+
24
+ num_inference_steps: int = 4
25
+ num_results_per_prompt: int = 1
26
+ latent_channels: int = 4
27
+ latent_height: int = 64
28
+ latent_width: int = 64
29
+
30
+ # Training/sampling settings
31
+ num_steps_sampling: int = 4
32
+
33
+ # Geometry settings
34
+ radius: float = 1.0
35
+ normal_type: str = "analytic"
36
+ sdf_bias: str = "sphere"
37
+ sdf_bias_params: float = 0.5
38
+ rotate_planes: str = "v1"
39
+ split_channels: str = "v1"
40
+ geo_interpolate: str = "v1"
41
+ tex_interpolate: str = "v2"
42
+ n_feature_dims: int = 3
43
+
44
+ sample_scheduler: str = "ddim" # any of "ddpm", "ddim"
45
+
46
+ # Network settings
47
+ mlp_network_config: dict = field(
48
+ default_factory=lambda: {
49
+ "otype": "VanillaMLP",
50
+ "activation": "ReLU",
51
+ "output_activation": "none",
52
+ "n_neurons": 64,
53
+ "n_hidden_layers": 2,
54
+ }
55
+ )
56
+
57
+ # Adapter settings
58
+ space_generator_config: dict = field(
59
+ default_factory=lambda: {
60
+ "training_type": "self_lora_rank_16-cross_lora_rank_16-locon_rank_16" ,
61
+ "output_dim": 64, # 32 * 2 for v1
62
+ "self_lora_type": "hexa_v1",
63
+ "cross_lora_type": "vanilla",
64
+ "locon_type": "vanilla_v1",
65
+ "prompt_bias": False,
66
+ "vae_attn_type": "basic", # "basic", "vanilla"
67
+ }
68
+ )
69
+
70
+ isosurface_deformable_grid: bool = True
71
+ isosurface_resolution: int = 160
72
+ color_activation: str = "sigmoid-mipnerf"
73
+
74
+ @classmethod
75
+ def from_pretrained(cls, pretrained_path: str) -> "TriplaneTurboTextTo3DPipelineConfig":
76
+ """Load config from pretrained path"""
77
+ config_path = os.path.join(pretrained_path, "config.json")
78
+ if os.path.exists(config_path):
79
+ with open(config_path, "r") as f:
80
+ config_dict = json.load(f)
81
+ return cls(**config_dict)
82
+ else:
83
+ print(f"No config file found at {pretrained_path}, using default config")
84
+ return cls() # Return default config if no config file found
85
+
86
+ class TriplaneTurboTextTo3DPipeline(Pipeline):
87
+ """
88
+ A pipeline for converting text to 3D models using triplane representation.
89
+ """
90
+ config_name = "config.json"
91
+
92
+ def __init__(
93
+ self,
94
+ geometry: StableDiffusionTriplaneDualAttention,
95
+ material: Callable,
96
+ base_pipeline: StableDiffusionPipeline,
97
+ sample_scheduler: Callable,
98
+ isosurface_helper: Callable,
99
+ **kwargs,
100
+ ):
101
+ super().__init__()
102
+ self.geometry = geometry
103
+ self.material = material
104
+
105
+ self.base_pipeline = base_pipeline
106
+
107
+ self.sample_scheduler = sample_scheduler
108
+ self.isosurface_helper = isosurface_helper
109
+
110
+
111
+ self.models = {
112
+ "geometry": geometry,
113
+ "base_pipeline": base_pipeline,
114
+ }
115
+
116
+ @classmethod
117
+ def from_pretrained(
118
+ cls,
119
+ pretrained_model_name_or_path: str,
120
+ **kwargs,
121
+ ):
122
+ """
123
+ Load pretrained adapter weights, config and update pipeline components.
124
+
125
+ Args:
126
+ pretrained_model_name_or_path: Path to pretrained adapter weights
127
+ base_pipeline: Optional base pipeline instance
128
+ **kwargs: Additional arguments to override config values
129
+
130
+ Returns:
131
+ pipeline: Updated pipeline instance
132
+ """
133
+ # Load config from pretrained path
134
+ config = TriplaneTurboTextTo3DPipelineConfig.from_pretrained(
135
+ pretrained_model_name_or_path,
136
+ **kwargs,
137
+ )
138
+
139
+ # load base pipeline
140
+ base_pipeline = StableDiffusionPipeline.from_pretrained(
141
+ config.base_model_name_or_path,
142
+ **kwargs,
143
+ )
144
+
145
+ # load sample scheduler
146
+ if config.sample_scheduler == "ddim":
147
+ from diffusers import DDIMScheduler
148
+ sample_scheduler = DDIMScheduler.from_pretrained(
149
+ config.base_model_name_or_path,
150
+ subfolder="scheduler",
151
+ )
152
+ else:
153
+ raise ValueError(f"Unknown sample scheduler: {config.sample_scheduler}")
154
+
155
+ # load geometry
156
+ geometry = StableDiffusionTriplaneDualAttention(
157
+ config=config,
158
+ vae=base_pipeline.vae,
159
+ unet=base_pipeline.unet,
160
+ )
161
+
162
+ # no gradient for geometry
163
+ for param in geometry.parameters():
164
+ param.requires_grad = False
165
+
166
+ # and load adapter weights
167
+ if pretrained_model_name_or_path.endswith(".pth"):
168
+ state_dict = torch.load(pretrained_model_name_or_path)["state_dict"]
169
+ new_state_dict = {}
170
+ for key, value in state_dict.items():
171
+ new_key = key.replace("geometry.", "")
172
+ new_state_dict[new_key] = value
173
+ _, unused = geometry.load_state_dict(new_state_dict, strict=False)
174
+ if len(unused) > 0:
175
+ print(f"Unused keys: {unused}")
176
+ else:
177
+ raise ValueError(f"Unknown pretrained model name or path: {pretrained_model_name_or_path}")
178
+
179
+
180
+ # load material, convert to int
181
+ # material = lambda x: (256 * get_activation(config.color_activation)(x)).int()
182
+ material = get_activation(config.color_activation)
183
+
184
+ # Load geometry model
185
+ pipeline = cls(
186
+ base_pipeline=base_pipeline,
187
+ geometry=geometry,
188
+ sample_scheduler=sample_scheduler,
189
+ material=material,
190
+ isosurface_helper=DiffMarchingCubeHelper(
191
+ resolution=config.isosurface_resolution,
192
+ ),
193
+ **kwargs,
194
+ )
195
+ return pipeline
196
+
197
+
198
+ def encode_prompt(
199
+ self,
200
+ prompt: Union[str, List[str]],
201
+ device: str,
202
+ num_results_per_prompt: int = 1,
203
+ ) -> torch.FloatTensor:
204
+ """
205
+ Encodes the prompt into text encoder hidden states.
206
+
207
+ Args:
208
+ prompt: The prompt to encode.
209
+ device: The device to use for encoding.
210
+ num_results_per_prompt: Number of results to generate per prompt.
211
+ do_classifier_free_guidance: Whether to use classifier-free guidance.
212
+ negative_prompt: The negative prompt to encode.
213
+
214
+ Returns:
215
+ text_embeddings: Text embeddings tensor.
216
+ """
217
+ # Use base_pipeline to encode prompt
218
+ text_embeddings = self.base_pipeline.encode_prompt(
219
+ prompt=prompt,
220
+ device=device,
221
+ num_images_per_prompt=num_results_per_prompt,
222
+ do_classifier_free_guidance=False,
223
+ negative_prompt=None
224
+ )
225
+ return text_embeddings
226
+
227
+ @torch.no_grad()
228
+ def __call__(
229
+ self,
230
+ prompt: Union[str, List[str]],
231
+ num_inference_steps: int = 4,
232
+ num_results_per_prompt: int = 1,
233
+ generator: Optional[torch.Generator] = None,
234
+ latents: Optional[torch.FloatTensor] = None,
235
+ return_dict: bool = True,
236
+ colorize: bool = True,
237
+ **kwargs,
238
+ ):
239
+ # Implementation similar to Zero123Pipeline
240
+ # Reference code from: https://github.com/zero123/zero123-diffusers
241
+
242
+ # Validate inputs
243
+ if isinstance(prompt, str):
244
+ batch_size = 1
245
+ prompt = [prompt]
246
+ elif isinstance(prompt, list):
247
+ batch_size = len(prompt)
248
+ else:
249
+ raise ValueError(f"Prompt must be a string or list of strings, got {type(prompt)}")
250
+
251
+ # Get the device from the first available module
252
+
253
+ # Generate latents if not provided
254
+ if latents is None:
255
+ latents = torch.randn(
256
+ (batch_size * 6, 4, 32, 32), # hard-coded for now
257
+ generator=generator,
258
+ device=self.device,
259
+ )
260
+
261
+ # Process text prompt through geometry module
262
+ text_embed, _ = self.encode_prompt(prompt, self.device, num_results_per_prompt)
263
+
264
+ # Run diffusion process
265
+ # Set up timesteps for sampling
266
+ timesteps = self._set_timesteps(
267
+ self.sample_scheduler,
268
+ num_inference_steps
269
+ )
270
+
271
+
272
+ with torch.no_grad():
273
+ # Run diffusion process
274
+ for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
275
+ # Scale model input
276
+ noisy_latent_input = self.sample_scheduler.scale_model_input(
277
+ latents,
278
+ t
279
+ )
280
+
281
+ # Predict noise/sample
282
+ pred = self.geometry.denoise(
283
+ noisy_input=noisy_latent_input,
284
+ text_embed=text_embed,
285
+ timestep=t.to(self.device),
286
+ )
287
+
288
+ # Update latents
289
+ results = self.sample_scheduler.step(pred, t, latents)
290
+ latents = results.prev_sample
291
+ latents_denoised = results.pred_original_sample
292
+
293
+ # Use final denoised latents
294
+ latents = latents_denoised
295
+
296
+ # Generate final 3D representation
297
+ space_cache = self.geometry.decode(latents)
298
+
299
+ # Extract mesh from space cache
300
+ mesh_list = isosurface(
301
+ space_cache,
302
+ self.geometry.forward_field,
303
+ self.isosurface_helper,
304
+ )
305
+
306
+ if colorize:
307
+ mesh_list = colorize_mesh(
308
+ space_cache,
309
+ self.geometry.export,
310
+ mesh_list,
311
+ activation=self.material,
312
+ )
313
+
314
+ # decide output type based on return_dict
315
+ if return_dict:
316
+ return {
317
+ "space_cache": space_cache,
318
+ "latents": latents,
319
+ "mesh": mesh_list,
320
+ }
321
+ else:
322
+ return mesh_list
323
+
324
+ def _set_timesteps(
325
+ self,
326
+ scheduler,
327
+ num_steps: int,
328
+ ):
329
+ """Set up timesteps for sampling.
330
+
331
+ Args:
332
+ scheduler: The scheduler to use for timestep generation
333
+ num_steps: Number of diffusion steps
334
+
335
+ Returns:
336
+ timesteps: Tensor of timesteps to use for sampling
337
+ """
338
+ scheduler.set_timesteps(num_steps)
339
+ timesteps_orig = scheduler.timesteps
340
+ # Shift timesteps to start from T
341
+ timesteps_delta = scheduler.config.num_train_timesteps - 1 - timesteps_orig.max()
342
+ timesteps = timesteps_orig + timesteps_delta
343
+ return timesteps
344
+
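
The shift in _set_timesteps makes the first denoising call land on the final training timestep regardless of the scheduler's stride. A worked example, assuming 1000 training timesteps, the default "leading" spacing, and 4 inference steps:

# Worked example of the timestep shift (assumed scheduler settings, for illustration).
import numpy as np

timesteps_orig = np.array([750, 500, 250, 0])   # what DDIMScheduler.set_timesteps(4) would yield
delta = (1000 - 1) - timesteps_orig.max()       # 249
print(timesteps_orig + delta)                   # [999 749 499 249] -> sampling starts at t = 999
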
triplaneturbo_executable/utils/__init__.py ADDED
File without changes
triplaneturbo_executable/utils/general_utils.py ADDED
@@ -0,0 +1,104 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch import Tensor
5
+ from typing import *
6
+ from jaxtyping import Float
7
+ from omegaconf import OmegaConf
8
+
9
+ def config_to_primitive(config, resolve: bool = True) -> Any:
10
+ return OmegaConf.to_container(config, resolve=resolve)
11
+
12
+ def scale_tensor(
13
+ dat: Float[Tensor, "... D"],
14
+ inp_scale: Union[Tuple[float, float], Float[Tensor, "2 D"]],
15
+ tgt_scale: Union[Tuple[float, float], Float[Tensor, "2 D"]]
16
+ ):
17
+ if inp_scale is None:
18
+ inp_scale = (0, 1)
19
+ if tgt_scale is None:
20
+ tgt_scale = (0, 1)
21
+ if isinstance(tgt_scale, Tensor):
22
+ assert dat.shape[-1] == tgt_scale.shape[-1]
23
+ dat = (dat - inp_scale[0]) / (inp_scale[1] - inp_scale[0])
24
+ dat = dat * (tgt_scale[1] - tgt_scale[0]) + tgt_scale[0]
25
+ return dat
26
+
27
+ def contract_to_unisphere_custom(
28
+ x: Float[Tensor, "... 3"], bbox: Float[Tensor, "2 3"], unbounded: bool = False
29
+ ) -> Float[Tensor, "... 3"]:
30
+ if unbounded:
31
+ x = scale_tensor(x, bbox, (-1, 1))
32
+ x = x * 2 - 1 # aabb is at [-1, 1]
33
+ mag = x.norm(dim=-1, keepdim=True)
34
+ mask = mag.squeeze(-1) > 1
35
+ x[mask] = (2 - 1 / mag[mask]) * (x[mask] / mag[mask])
36
+ x = x / 4 + 0.5 # [-inf, inf] is at [0, 1]
37
+ else:
38
+ x = scale_tensor(x, bbox, (-1, 1))
39
+ return x
40
+
41
+ # bug fix in https://github.com/NVlabs/eg3d/issues/67
42
+ planes = torch.tensor(
43
+ [
44
+ [
45
+ [1, 0, 0],
46
+ [0, 1, 0],
47
+ [0, 0, 1]
48
+ ],
49
+ [
50
+ [1, 0, 0],
51
+ [0, 0, 1],
52
+ [0, 1, 0]
53
+ ],
54
+ [
55
+ [0, 0, 1],
56
+ [0, 1, 0],
57
+ [1, 0, 0]
58
+ ]
59
+ ], dtype=torch.float32)
60
+
61
+
62
+ def grid_sample(input, grid):
63
+ # if grid.requires_grad and _should_use_custom_op():
64
+ # return grid_sample_2d(input, grid, padding_mode='zeros', align_corners=False)
65
+ return torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)
66
+
67
+
68
+ def project_onto_planes(planes, coordinates):
69
+ """
70
+ Does a projection of a 3D point onto a batch of 2D planes,
71
+ returning 2D plane coordinates.
72
+
73
+ Takes plane axes of shape n_planes, 3, 3
74
+ # Takes coordinates of shape N, M, 3
75
+ # returns projections of shape N*n_planes, M, 2
76
+ """
77
+ N, M, C = coordinates.shape
78
+ n_planes, _, _ = planes.shape
79
+ coordinates = coordinates.unsqueeze(1).expand(-1, n_planes, -1, -1).reshape(N*n_planes, M, 3)
80
+ inv_planes = torch.linalg.inv(planes).unsqueeze(0).expand(N, -1, -1, -1).reshape(N*n_planes, 3, 3)
81
+ projections = torch.bmm(coordinates, inv_planes)
82
+ return projections[..., :2]
83
+
84
+ def sample_from_planes(plane_features, coordinates, mode='bilinear', padding_mode='zeros', box_warp=2, interpolate_feat: Optional[str] = None):
85
+ assert padding_mode == 'zeros'
86
+ N, n_planes, C, H, W = plane_features.shape
87
+ _, M, _ = coordinates.shape
88
+ plane_features = plane_features.view(N*n_planes, C, H, W)
89
+
90
+ coordinates = (2/box_warp) * coordinates # add specific box bounds
91
+
92
+ if interpolate_feat in [None, "v1"]:
93
+ projected_coordinates = project_onto_planes(planes.to(coordinates), coordinates).unsqueeze(1)
94
+ output_features = grid_sample(plane_features, projected_coordinates.float())
95
+ output_features = output_features.permute(0, 3, 2, 1).reshape(N, n_planes, M, C)
96
+ output_features = output_features.sum(dim=1, keepdim=True).reshape(N, M, C)
97
+
98
+ elif interpolate_feat in ["v2"]:
99
+ projected_coordinates = project_onto_planes(planes.to(coordinates), coordinates).unsqueeze(1)
100
+ output_features = grid_sample(plane_features, projected_coordinates.float())
101
+ output_features = output_features.permute(0, 3, 2, 1).reshape(N, n_planes, M, C)
102
+ output_features = output_features.permute(0, 2, 1, 3).reshape(N, M, n_planes*C)
103
+
104
+ return output_features.contiguous()
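
To pin down the shape contract of project_onto_planes and sample_from_planes, a small sketch with illustrative sizes (the module-level `planes` tensor above supplies the three canonical plane bases):

# Shape sketch for tri-plane feature sampling (illustrative sizes).
import torch

N, M, C, H, W = 2, 1024, 32, 256, 256
plane_features = torch.randn(N, 3, C, H, W)     # three feature planes per sample
coordinates = torch.rand(N, M, 3) * 2 - 1       # points already inside [-1, 1]^3 (box_warp = 2)

feat_v1 = sample_from_planes(plane_features, coordinates, interpolate_feat="v1")
print(feat_v1.shape)                            # (N, M, C): the three plane samples are summed

feat_v2 = sample_from_planes(plane_features, coordinates, interpolate_feat="v2")
print(feat_v2.shape)                            # (N, M, 3*C): the three plane samples are concatenated
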
triplaneturbo_executable/utils/mesh.py ADDED
@@ -0,0 +1,288 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
+ from typing import Any, Dict, Optional, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from jaxtyping import Float, Integer
11
+ from torch import Tensor
12
+
13
+ def dot(x, y):
14
+ return torch.sum(x * y, -1, keepdim=True)
15
+
16
+ class Mesh:
17
+ def __init__(
18
+ self, v_pos: Float[Tensor, "Nv 3"], t_pos_idx: Integer[Tensor, "Nf 3"], **kwargs
19
+ ) -> None:
20
+ self.v_pos: Float[Tensor, "Nv 3"] = v_pos
21
+ self.t_pos_idx: Integer[Tensor, "Nf 3"] = t_pos_idx
22
+ self._v_nrm: Optional[Float[Tensor, "Nv 3"]] = None
23
+ self._v_tng: Optional[Float[Tensor, "Nv 3"]] = None
24
+ self._v_tex: Optional[Float[Tensor, "Nt 3"]] = None
25
+ self._t_tex_idx: Optional[Float[Tensor, "Nf 3"]] = None
26
+ self._v_rgb: Optional[Float[Tensor, "Nv 3"]] = None
27
+ self._edges: Optional[Integer[Tensor, "Ne 2"]] = None
28
+ self.extras: Dict[str, Any] = {}
29
+ for k, v in kwargs.items():
30
+ self.add_extra(k, v)
31
+
32
+ def add_extra(self, k, v) -> None:
33
+ self.extras[k] = v
34
+
35
+ def remove_outlier(self, outlier_n_faces_threshold: Union[int, float]):
36
+
37
+ # use trimesh to first split the mesh into connected components
38
+ # then remove the components with less than n_face_threshold faces
39
+ import trimesh
40
+
41
+ # construct a trimesh object
42
+ mesh = trimesh.Trimesh(
43
+ vertices=self.v_pos.detach().cpu().numpy(),
44
+ faces=self.t_pos_idx.detach().cpu().numpy(),
45
+ )
46
+
47
+ # split the mesh into connected components
48
+ components = mesh.split(only_watertight=False)
49
+
50
+
51
+ n_faces_threshold: int
52
+ if isinstance(outlier_n_faces_threshold, float):
53
+ # set the threshold to the number of faces in the largest component multiplied by outlier_n_faces_threshold
54
+ n_faces_threshold = int(
55
+ max([c.faces.shape[0] for c in components]) * outlier_n_faces_threshold
56
+ )
57
+ else:
58
+ # set the threshold directly to outlier_n_faces_threshold
59
+ n_faces_threshold = outlier_n_faces_threshold
60
+
61
+ # remove the components with less than n_face_threshold faces
62
+ components = [c for c in components if c.faces.shape[0] >= n_faces_threshold]
63
+
64
+ # merge the components
65
+ mesh = trimesh.util.concatenate(components)
66
+
67
+ # convert back to our mesh format
68
+ v_pos = torch.from_numpy(mesh.vertices).to(self.v_pos)
69
+ t_pos_idx = torch.from_numpy(mesh.faces).to(self.t_pos_idx)
70
+
71
+ clean_mesh = Mesh(v_pos, t_pos_idx)
72
+ # keep the extras unchanged
73
+
74
+ return clean_mesh
75
+
76
+ @property
77
+ def requires_grad(self):
78
+ return self.v_pos.requires_grad
79
+
80
+ @property
81
+ def v_nrm(self):
82
+ if self._v_nrm is None:
83
+ self._v_nrm = self._compute_vertex_normal()
84
+ return self._v_nrm
85
+
86
+ @property
87
+ def v_tng(self):
88
+ if self._v_tng is None:
89
+ self._v_tng = self._compute_vertex_tangent()
90
+ return self._v_tng
91
+
92
+ @property
93
+ def v_tex(self):
94
+ if self._v_tex is None:
95
+ self._v_tex, self._t_tex_idx = self._unwrap_uv()
96
+ return self._v_tex
97
+
98
+ @property
99
+ def t_tex_idx(self):
100
+ if self._t_tex_idx is None:
101
+ self._v_tex, self._t_tex_idx = self._unwrap_uv()
102
+ return self._t_tex_idx
103
+
104
+ @property
105
+ def v_rgb(self):
106
+ return self._v_rgb
107
+
108
+ @property
109
+ def edges(self):
110
+ if self._edges is None:
111
+ self._edges = self._compute_edges()
112
+ return self._edges
113
+
114
+ def _compute_vertex_normal(self):
115
+ i0 = self.t_pos_idx[:, 0]
116
+ i1 = self.t_pos_idx[:, 1]
117
+ i2 = self.t_pos_idx[:, 2]
118
+
119
+ v0 = self.v_pos[i0, :]
120
+ v1 = self.v_pos[i1, :]
121
+ v2 = self.v_pos[i2, :]
122
+
123
+ face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)
124
+
125
+ # Splat face normals to vertices
126
+ v_nrm = torch.zeros_like(self.v_pos)
127
+ v_nrm.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
128
+ v_nrm.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
129
+ v_nrm.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)
130
+
131
+ # Normalize, replace zero (degenerated) normals with some default value
132
+ v_nrm = torch.where(
133
+ dot(v_nrm, v_nrm) > 1e-20, v_nrm, torch.as_tensor([0.0, 0.0, 1.0]).to(v_nrm)
134
+ )
135
+ v_nrm = F.normalize(v_nrm, dim=1)
136
+
137
+ if torch.is_anomaly_enabled():
138
+ assert torch.all(torch.isfinite(v_nrm))
139
+
140
+ return v_nrm
141
+
142
+ def _compute_vertex_tangent(self):
143
+ vn_idx = [None] * 3
144
+ pos = [None] * 3
145
+ tex = [None] * 3
146
+ for i in range(0, 3):
147
+ pos[i] = self.v_pos[self.t_pos_idx[:, i]]
148
+ tex[i] = self.v_tex[self.t_tex_idx[:, i]]
149
+ # t_nrm_idx is always the same as t_pos_idx
150
+ vn_idx[i] = self.t_pos_idx[:, i]
151
+
152
+ tangents = torch.zeros_like(self.v_nrm)
153
+ tansum = torch.zeros_like(self.v_nrm)
154
+
155
+ # Compute tangent space for each triangle
156
+ uve1 = tex[1] - tex[0]
157
+ uve2 = tex[2] - tex[0]
158
+ pe1 = pos[1] - pos[0]
159
+ pe2 = pos[2] - pos[0]
160
+
161
+ nom = pe1 * uve2[..., 1:2] - pe2 * uve1[..., 1:2]
162
+ denom = uve1[..., 0:1] * uve2[..., 1:2] - uve1[..., 1:2] * uve2[..., 0:1]
163
+
164
+ # Avoid division by zero for degenerated texture coordinates
165
+ tang = nom / torch.where(
166
+ denom > 0.0, torch.clamp(denom, min=1e-6), torch.clamp(denom, max=-1e-6)
167
+ )
168
+
169
+ # Update all 3 vertices
170
+ for i in range(0, 3):
171
+ idx = vn_idx[i][:, None].repeat(1, 3)
172
+ tangents.scatter_add_(0, idx, tang) # tangents[n_i] = tangents[n_i] + tang
173
+ tansum.scatter_add_(
174
+ 0, idx, torch.ones_like(tang)
175
+ ) # tansum[n_i] = tansum[n_i] + 1
176
+ tangents = tangents / tansum
177
+
178
+ # Normalize and make sure tangent is perpendicular to normal
179
+ tangents = F.normalize(tangents, dim=1)
180
+ tangents = F.normalize(tangents - dot(tangents, self.v_nrm) * self.v_nrm)
181
+
182
+ if torch.is_anomaly_enabled():
183
+ assert torch.all(torch.isfinite(tangents))
184
+
185
+ return tangents
186
+
187
+ def _unwrap_uv(
188
+ self, xatlas_chart_options: dict = {}, xatlas_pack_options: dict = {}
189
+ ):
190
+
191
+ import xatlas
192
+
193
+ atlas = xatlas.Atlas()
194
+ atlas.add_mesh(
195
+ self.v_pos.detach().cpu().numpy(),
196
+ self.t_pos_idx.cpu().numpy(),
197
+ )
198
+ co = xatlas.ChartOptions()
199
+ po = xatlas.PackOptions()
200
+ for k, v in xatlas_chart_options.items():
201
+ setattr(co, k, v)
202
+ for k, v in xatlas_pack_options.items():
203
+ setattr(po, k, v)
204
+ atlas.generate(co, po)
205
+ vmapping, indices, uvs = atlas.get_mesh(0)
206
+ vmapping = (
207
+ torch.from_numpy(
208
+ vmapping.astype(np.uint64, casting="same_kind").view(np.int64)
209
+ )
210
+ .to(self.v_pos.device)
211
+ .long()
212
+ )
213
+ uvs = torch.from_numpy(uvs).to(self.v_pos.device).float()
214
+ indices = (
215
+ torch.from_numpy(
216
+ indices.astype(np.uint64, casting="same_kind").view(np.int64)
217
+ )
218
+ .to(self.v_pos.device)
219
+ .long()
220
+ )
221
+ return uvs, indices
222
+
223
+ def unwrap_uv(
224
+ self, xatlas_chart_options: dict = {}, xatlas_pack_options: dict = {}
225
+ ):
226
+ self._v_tex, self._t_tex_idx = self._unwrap_uv(
227
+ xatlas_chart_options, xatlas_pack_options
228
+ )
229
+
230
+ def set_vertex_color(self, v_rgb):
231
+ assert v_rgb.shape[0] == self.v_pos.shape[0]
232
+ self._v_rgb = v_rgb
233
+
234
+ def _compute_edges(self):
235
+ # Compute edges
236
+ edges = torch.cat(
237
+ [
238
+ self.t_pos_idx[:, [0, 1]],
239
+ self.t_pos_idx[:, [1, 2]],
240
+ self.t_pos_idx[:, [2, 0]],
241
+ ],
242
+ dim=0,
243
+ )
244
+ edges = edges.sort()[0]
245
+ edges = torch.unique(edges, dim=0)
246
+ return edges
247
+
248
+ def normal_consistency(self) -> Float[Tensor, ""]:
249
+ edge_nrm: Float[Tensor, "Ne 2 3"] = self.v_nrm[self.edges]
250
+ nc = (
251
+ 1.0 - torch.cosine_similarity(edge_nrm[:, 0], edge_nrm[:, 1], dim=-1)
252
+ ).mean()
253
+ return nc
254
+
255
+ def _laplacian_uniform(self):
256
+ # from stable-dreamfusion
257
+ # https://github.com/ashawkey/stable-dreamfusion/blob/8fb3613e9e4cd1ded1066b46e80ca801dfb9fd06/nerf/renderer.py#L224
258
+ verts, faces = self.v_pos, self.t_pos_idx
259
+
260
+ V = verts.shape[0]
261
+ F = faces.shape[0]
262
+
263
+ # Neighbor indices
264
+ ii = faces[:, [1, 2, 0]].flatten()
265
+ jj = faces[:, [2, 0, 1]].flatten()
266
+ adj = torch.stack([torch.cat([ii, jj]), torch.cat([jj, ii])], dim=0).unique(
267
+ dim=1
268
+ )
269
+ adj_values = torch.ones(adj.shape[1]).to(verts)
270
+
271
+ # Diagonal indices
272
+ diag_idx = adj[0]
273
+
274
+ # Build the sparse matrix
275
+ idx = torch.cat((adj, torch.stack((diag_idx, diag_idx), dim=0)), dim=1)
276
+ values = torch.cat((-adj_values, adj_values))
277
+
278
+ # The coalesce operation sums the duplicate indices, resulting in the
279
+ # correct diagonal
280
+ return torch.sparse_coo_tensor(idx, values, (V, V)).coalesce()
281
+
282
+ def laplacian(self) -> Float[Tensor, ""]:
283
+ with torch.no_grad():
284
+ L = self._laplacian_uniform()
285
+ loss = L.mm(self.v_pos)
286
+ loss = loss.norm(dim=1)
287
+ loss = loss.mean()
288
+ return loss
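
A small sketch exercising the Mesh container above on a tetrahedron, touching the lazily computed vertex normals and the two regularizers (values are illustrative):

# Sketch: build a tiny Mesh and access its derived quantities.
import torch

v_pos = torch.tensor([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
t_pos_idx = torch.tensor([[0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3]])
mesh = Mesh(v_pos=v_pos, t_pos_idx=t_pos_idx)

normals = mesh.v_nrm                     # (4, 3) per-vertex normals, computed on first access
edges = mesh.edges                       # (6, 2) unique undirected edges
smoothness = mesh.laplacian()            # scalar uniform-Laplacian magnitude
consistency = mesh.normal_consistency()  # scalar normal-consistency penalty
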
triplaneturbo_executable/utils/mesh_exporter.py ADDED
@@ -0,0 +1,231 @@
1
+ from typing import Callable, Dict, List, Optional, Tuple, Any
2
+ from jaxtyping import Float
3
+ from torch import Tensor
4
+ from dataclasses import dataclass
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import os
9
+ import numpy as np
10
+ from .saving import SaverMixin
11
+
12
+ from ..utils.mesh import Mesh
13
+ from ..utils.general_utils import scale_tensor
14
+
15
+ @dataclass
16
+ class ExporterOutput:
17
+ save_name: str
18
+ save_type: str
19
+ params: Dict[str, Any]
20
+
21
+
22
+ class IsosurfaceHelper(nn.Module):
23
+ points_range: Tuple[float, float] = (0, 1)
24
+
25
+ @property
26
+ def grid_vertices(self) -> Float[Tensor, "N 3"]:
27
+ raise NotImplementedError
28
+
29
+ class DiffMarchingCubeHelper(IsosurfaceHelper):
30
+ def __init__(
31
+ self,
32
+ resolution: int,
33
+ point_range: Tuple[float, float] = (0, 1)
34
+ ) -> None:
35
+ super().__init__()
36
+ self.resolution = resolution
37
+ self.points_range = point_range
38
+
39
+ from diso import DiffMC
40
+ self.mc_func: Callable = DiffMC(dtype=torch.float32)
41
+ self._grid_vertices: Optional[Float[Tensor, "N3 3"]] = None
42
+ self._dummy: Float[Tensor, "..."]
43
+ self.register_buffer(
44
+ "_dummy", torch.zeros(0, dtype=torch.float32), persistent=False
45
+ )
46
+
47
+ @property
48
+ def grid_vertices(self) -> Float[Tensor, "N3 3"]:
49
+ if self._grid_vertices is None:
50
+ # keep the vertices on CPU so that we can support very large resolution
51
+ x, y, z = (
52
+ torch.linspace(*self.points_range, self.resolution),
53
+ torch.linspace(*self.points_range, self.resolution),
54
+ torch.linspace(*self.points_range, self.resolution),
55
+ )
56
+ x, y, z = torch.meshgrid(x, y, z, indexing="ij")
57
+ verts = torch.stack([x, y, z], dim=-1).reshape(-1, 3)
58
+ verts = verts * (self.points_range[1] - self.points_range[0]) + self.points_range[0]
59
+
60
+ self._grid_vertices = verts
61
+ return self._grid_vertices
62
+
63
+ def forward(
64
+ self,
65
+ level: Float[Tensor, "N3 1"],
66
+ deformation: Optional[Float[Tensor, "N3 3"]] = None,
67
+ isovalue=0.0,
68
+ ) -> Mesh:
69
+ level = level.view(self.resolution, self.resolution, self.resolution)
70
+ if deformation is not None:
71
+ deformation = deformation.view(self.resolution, self.resolution, self.resolution, 3)
72
+ v_pos, t_pos_idx = self.mc_func(level, deformation, isovalue=isovalue)
73
+ v_pos = v_pos * (self.points_range[1] - self.points_range[0]) + self.points_range[0]
74
+ # TODO: if the mesh is good
75
+ return Mesh(v_pos=v_pos, t_pos_idx=t_pos_idx)
76
+
77
+
78
+ def isosurface(
79
+ space_cache: Float[Tensor, "B ..."],
80
+ forward_field: Callable,
81
+ isosurface_helper: Callable,
82
+ ) -> List[Mesh]:
83
+
84
+ # the isosurface is dependent on the space cache
85
+ # randomly detach isosurface method if it is differentiable
86
+ # get the batchsize
87
+ if torch.is_tensor(space_cache): #space cache
88
+ batch_size = space_cache.shape[0]
89
+ elif isinstance(space_cache, Dict): #hyper net
90
+ # Dict[str, List[Float[Tensor, "B ..."]]]
91
+ for key in space_cache.keys():
92
+ batch_size = space_cache[key][0].shape[0]
93
+ break
94
+
95
+ # scale the points to [-1, 1]
96
+ points = scale_tensor(
97
+ isosurface_helper.grid_vertices.to(space_cache.device),
98
+ isosurface_helper.points_range,
99
+ [-1, 1], # hard coded isosurface_bbox
100
+ )
101
+ # get the sdf values
102
+ sdf_batch, deformation_batch = forward_field(
103
+ points[None, ...].expand(batch_size, -1, -1),
104
+ space_cache
105
+ )
106
+
107
+ # get the isosurface
108
+ mesh_list = []
109
+
110
+ # check if the sdf is empty
111
+ # for sdf, deformation in zip(sdf_batch, deformation_batch):
112
+ for index in range(sdf_batch.shape[0]):
113
+ sdf = sdf_batch[index]
114
+
115
+ # the deformation may be None
116
+ if deformation_batch is None:
117
+ deformation = None
118
+ else:
119
+ deformation = deformation_batch[index]
120
+
121
+ # special case when all sdf values are positive or negative, thus no isosurface
122
+ if torch.all(sdf > 0) or torch.all(sdf < 0):
123
+
124
+ print("All sdf values are positive or negative; no isosurface found, falling back to a unit-sphere SDF")
125
+ sdf = torch.norm(points, dim=-1) - 1
126
+
127
+ mesh = isosurface_helper(sdf, deformation)
128
+
129
+ mesh.v_pos = scale_tensor(
130
+ mesh.v_pos,
131
+ isosurface_helper.points_range,
132
+ [-1, 1], # hard coded isosurface_bbox
133
+ )
134
+
135
+ # TODO: implement outlier removal
136
+ # if cfg.isosurface_remove_outliers:
137
+ # mesh = mesh.remove_outlier(cfg.isosurface_outlier_n_faces_threshold)
138
+
139
+ mesh_list.append(mesh)
140
+
141
+ return mesh_list
142
+
143
+ def colorize_mesh(
144
+ space_cache: Any,
145
+ export_fn: Callable,
146
+ mesh_list: List[Mesh],
147
+ activation: Callable,
148
+ ) -> List[Mesh]:
149
+ """Colorize the mesh using the geometry's export function and space cache.
150
+
151
+ Args:
152
+ space_cache: The space cache containing feature information
153
+ export_fn: The export function from geometry that generates features
154
+ mesh_list: List of meshes to colorize
155
+
156
+ Returns:
157
+ List[Mesh]: List of colorized meshes
158
+ """
159
+ # Process each mesh in the batch
160
+ for i, mesh in enumerate(mesh_list):
161
+ # Get vertex positions
162
+ points = mesh.v_pos[None, ...] # Add batch dimension [1, N, 3]
163
+
164
+ # Get the corresponding space cache slice for this mesh
165
+ if torch.is_tensor(space_cache):
166
+ space_cache_slice = space_cache[i:i+1]
167
+ elif isinstance(space_cache, dict):
168
+ space_cache_slice = {}
169
+ for key in space_cache.keys():
170
+ space_cache_slice[key] = [
171
+ weight[i:i+1] for weight in space_cache[key]
172
+ ]
173
+
174
+ # Export features for the vertices
175
+ out = export_fn(points, space_cache_slice)
176
+
177
+ # Update vertex colors if features exist
178
+ if "features" in out:
179
+ features = out["features"].squeeze(0) # Remove batch dim [N, C]
180
+ # Convert features to RGB colors
181
+ mesh._v_rgb = activation(features) # Access private attribute directly
182
+
183
+ return mesh_list
184
+
185
+ class MeshExporter(SaverMixin):
186
+ def __init__(self, save_dir="outputs"):
187
+ self.save_dir = save_dir
188
+ os.makedirs(save_dir, exist_ok=True)
189
+
190
+ def get_save_dir(self):
191
+ return self.save_dir
192
+
193
+ def get_save_path(self, filename):
194
+ return os.path.join(self.save_dir, filename)
195
+
196
+ def convert_data(self, x):
197
+ if isinstance(x, torch.Tensor):
198
+ return x.detach().cpu().numpy()
199
+ return x
200
+
201
+ def export_obj(
202
+ mesh: Mesh,
203
+ save_path: str,
204
+ save_normal: bool = False,
205
+ ) -> List[str]:
206
+ """
207
+ Export mesh data to OBJ file format.
208
+
209
+ Args:
210
+ mesh_data: Dictionary containing mesh data (vertices, faces, etc.)
211
+ save_path: Path to save the OBJ file
212
+
213
+ Returns:
214
+ List of saved file paths
215
+ """
216
+
217
+ # Create exporter
218
+ exporter = MeshExporter(os.path.dirname(save_path))
219
+
220
+ # Export mesh
221
+ save_paths = exporter.save_obj(
222
+ os.path.basename(save_path),
223
+ mesh,
224
+ save_mat=None,
225
+ save_normal=save_normal and mesh.v_nrm is not None,
226
+ save_uv=False,
227
+ save_vertex_color=mesh.v_rgb is not None,
228
+ )
229
+
230
+ return save_paths
231
+
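
A hedged end-to-end sketch of the exporter utilities above: sample a sphere SDF on the helper's grid, run marching cubes, and write an OBJ. This assumes the diso package is installed and a CUDA device is available for DiffMC; the resolution and output path are arbitrary examples.

# Sketch: extract and export a sphere mesh with the helpers defined above.
import torch

helper = DiffMarchingCubeHelper(resolution=64).cuda()
points = scale_tensor(helper.grid_vertices, helper.points_range, [-1, 1]).cuda()
sdf = points.norm(dim=-1, keepdim=True) - 0.5                        # sphere of radius 0.5

mesh = helper(sdf)                                                   # Mesh with v_pos in grid space
mesh.v_pos = scale_tensor(mesh.v_pos, helper.points_range, [-1, 1])  # map back to world space
save_paths = export_obj(mesh, "outputs/sphere.obj")                  # writes outputs/sphere.obj
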
triplaneturbo_executable/utils/saving.py ADDED
@@ -0,0 +1,754 @@
1
+ import json
2
+ import os
3
+ import re
4
+ import shutil
5
+
6
+ import cv2
7
+ import imageio
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import torch
11
+ import trimesh
12
+ import wandb
13
+ from matplotlib import cm
14
+ from matplotlib.colors import LinearSegmentedColormap
15
+ from PIL import Image, ImageDraw
16
+ from pytorch_lightning.loggers import WandbLogger
17
+
18
+ from ..utils.mesh import Mesh
19
+
20
+ from typing import Dict, List, Optional, Union, Any
21
+ from omegaconf import DictConfig
22
+ from jaxtyping import Float
23
+ from torch import Tensor
24
+
25
+ import threading
26
+
27
+ class SaverMixin:
28
+ _save_dir: Optional[str] = None
29
+ _wandb_logger: Optional[WandbLogger] = None
30
+
31
+ def set_save_dir(self, save_dir: str):
32
+ self._save_dir = save_dir
33
+
34
+ def get_save_dir(self):
35
+ if self._save_dir is None:
36
+ raise ValueError("Save dir is not set")
37
+ return self._save_dir
38
+
39
+ def convert_data(self, data):
40
+ if data is None:
41
+ return None
42
+ elif isinstance(data, np.ndarray):
43
+ return data
44
+ elif isinstance(data, torch.Tensor):
45
+ return data.detach().cpu().numpy()
46
+ elif isinstance(data, list):
47
+ return [self.convert_data(d) for d in data]
48
+ elif isinstance(data, dict):
49
+ return {k: self.convert_data(v) for k, v in data.items()}
50
+ else:
51
+ raise TypeError(
52
+ "Data must be in type numpy.ndarray, torch.Tensor, list or dict, getting",
53
+ type(data),
54
+ )
55
+
56
+ def get_save_path(self, filename):
57
+ save_path = os.path.join(self.get_save_dir(), filename)
58
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
59
+ return save_path
60
+
61
+ def create_loggers(self, cfg_loggers: DictConfig) -> None:
62
+ if "wandb" in cfg_loggers.keys() and cfg_loggers.wandb.enable:
63
+ self._wandb_logger = WandbLogger(
64
+ project=cfg_loggers.wandb.project, name=cfg_loggers.wandb.name
65
+ )
66
+
67
+ def get_loggers(self) -> List:
68
+ if self._wandb_logger:
69
+ return [self._wandb_logger]
70
+ else:
71
+ return []
72
+
73
+ DEFAULT_RGB_KWARGS = {"data_format": "HWC", "data_range": (0, 1)}
74
+ DEFAULT_UV_KWARGS = {
75
+ "data_format": "HWC",
76
+ "data_range": (0, 1),
77
+ "cmap": "checkerboard",
78
+ }
79
+ DEFAULT_GRAYSCALE_KWARGS = {"data_range": None, "cmap": "jet"}
80
+ DEFAULT_GRID_KWARGS = {"align": "max"}
81
+
82
+ def get_rgb_image_(self, img, data_format, data_range, rgba=False):
83
+ img = self.convert_data(img)
84
+ assert data_format in ["CHW", "HWC"]
85
+ if data_format == "CHW":
86
+ img = img.transpose(1, 2, 0)
87
+ if img.dtype != np.uint8:
88
+ img = img.clip(min=data_range[0], max=data_range[1])
89
+ img = (
90
+ (img - data_range[0]) / (data_range[1] - data_range[0]) * 255.0
91
+ ).astype(np.uint8)
92
+ nc = 4 if rgba else 3
93
+ imgs = [img[..., start : start + nc] for start in range(0, img.shape[-1], nc)]
94
+ imgs = [
95
+ img_
96
+ if img_.shape[-1] == nc
97
+ else np.concatenate(
98
+ [
99
+ img_,
100
+ np.zeros(
101
+ (img_.shape[0], img_.shape[1], nc - img_.shape[2]),
102
+ dtype=img_.dtype,
103
+ ),
104
+ ],
105
+ axis=-1,
106
+ )
107
+ for img_ in imgs
108
+ ]
109
+ img = np.concatenate(imgs, axis=1)
110
+ if rgba:
111
+ img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGRA)
112
+ else:
113
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
114
+ return img
115
+
116
+ def _save_rgb_image(
117
+ self,
118
+ filename,
119
+ img,
120
+ data_format,
121
+ data_range,
122
+ name: Optional[str] = None,
123
+ step: Optional[int] = None,
124
+ ):
125
+ img = self.get_rgb_image_(img, data_format, data_range)
126
+ cv2.imwrite(filename, img)
127
+ if name and self._wandb_logger:
128
+ wandb.log(
129
+ {
130
+ name: wandb.Image(self.get_save_path(filename)),
131
+ "trainer/global_step": step,
132
+ }
133
+ )
134
+
135
+ def save_rgb_image(
136
+ self,
137
+ filename,
138
+ img,
139
+ data_format=DEFAULT_RGB_KWARGS["data_format"],
140
+ data_range=DEFAULT_RGB_KWARGS["data_range"],
141
+ name: Optional[str] = None,
142
+ step: Optional[int] = None,
143
+ ) -> str:
144
+ save_path = self.get_save_path(filename)
145
+ self._save_rgb_image(save_path, img, data_format, data_range, name, step)
146
+ return save_path
147
+
148
+ def get_uv_image_(self, img, data_format, data_range, cmap):
149
+ img = self.convert_data(img)
150
+ assert data_format in ["CHW", "HWC"]
151
+ if data_format == "CHW":
152
+ img = img.transpose(1, 2, 0)
153
+ img = img.clip(min=data_range[0], max=data_range[1])
154
+ img = (img - data_range[0]) / (data_range[1] - data_range[0])
155
+ assert cmap in ["checkerboard", "color"]
156
+ if cmap == "checkerboard":
157
+ n_grid = 64
158
+ mask = (img * n_grid).astype(int)
159
+ mask = (mask[..., 0] + mask[..., 1]) % 2 == 0
160
+ img = np.ones((img.shape[0], img.shape[1], 3), dtype=np.uint8) * 255
161
+ img[mask] = np.array([255, 0, 255], dtype=np.uint8)
162
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
163
+ elif cmap == "color":
164
+ img_ = np.zeros((img.shape[0], img.shape[1], 3), dtype=np.uint8)
165
+ img_[..., 0] = (img[..., 0] * 255).astype(np.uint8)
166
+ img_[..., 1] = (img[..., 1] * 255).astype(np.uint8)
167
+ img_ = cv2.cvtColor(img_, cv2.COLOR_RGB2BGR)
168
+ img = img_
169
+ return img
170
+
171
+ def save_uv_image(
172
+ self,
173
+ filename,
174
+ img,
175
+ data_format=DEFAULT_UV_KWARGS["data_format"],
176
+ data_range=DEFAULT_UV_KWARGS["data_range"],
177
+ cmap=DEFAULT_UV_KWARGS["cmap"],
178
+ ) -> str:
179
+ save_path = self.get_save_path(filename)
180
+ img = self.get_uv_image_(img, data_format, data_range, cmap)
181
+ cv2.imwrite(save_path, img)
182
+ return save_path
183
+
184
+ def get_grayscale_image_(self, img, data_range, cmap):
185
+ img = self.convert_data(img)
186
+ img = np.nan_to_num(img)
187
+ if data_range is None:
188
+ img = (img - img.min()) / (img.max() - img.min())
189
+ else:
190
+ img = img.clip(data_range[0], data_range[1])
191
+ img = (img - data_range[0]) / (data_range[1] - data_range[0])
192
+ assert cmap in [None, "jet", "magma", "spectral"]
193
+         if cmap is None:
194
+ img = (img * 255.0).astype(np.uint8)
195
+ img = np.repeat(img[..., None], 3, axis=2)
196
+ elif cmap == "jet":
197
+ img = (img * 255.0).astype(np.uint8)
198
+ img = cv2.applyColorMap(img, cv2.COLORMAP_JET)
199
+ elif cmap == "magma":
200
+ img = 1.0 - img
201
+ base = cm.get_cmap("magma")
202
+ num_bins = 256
203
+ colormap = LinearSegmentedColormap.from_list(
204
+ f"{base.name}{num_bins}", base(np.linspace(0, 1, num_bins)), num_bins
205
+ )(np.linspace(0, 1, num_bins))[:, :3]
206
+ a = np.floor(img * 255.0)
207
+ b = (a + 1).clip(max=255.0)
208
+ f = img * 255.0 - a
209
+ a = a.astype(np.uint16).clip(0, 255)
210
+ b = b.astype(np.uint16).clip(0, 255)
211
+ img = colormap[a] + (colormap[b] - colormap[a]) * f[..., None]
212
+ img = (img * 255.0).astype(np.uint8)
213
+ elif cmap == "spectral":
214
+ colormap = plt.get_cmap("Spectral")
215
+
216
+ def blend_rgba(image):
217
+ image = image[..., :3] * image[..., -1:] + (
218
+ 1.0 - image[..., -1:]
219
+ ) # blend A to RGB
220
+ return image
221
+
222
+ img = colormap(img)
223
+ img = blend_rgba(img)
224
+ img = (img * 255).astype(np.uint8)
225
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
226
+ return img
227
+
228
+ def _save_grayscale_image(
229
+ self,
230
+ filename,
231
+ img,
232
+ data_range,
233
+ cmap,
234
+ name: Optional[str] = None,
235
+ step: Optional[int] = None,
236
+ ):
237
+ img = self.get_grayscale_image_(img, data_range, cmap)
238
+ cv2.imwrite(filename, img)
239
+ if name and self._wandb_logger:
240
+ wandb.log(
241
+ {
242
+ name: wandb.Image(self.get_save_path(filename)),
243
+ "trainer/global_step": step,
244
+ }
245
+ )
246
+
247
+ def save_grayscale_image(
248
+ self,
249
+ filename,
250
+ img,
251
+ data_range=DEFAULT_GRAYSCALE_KWARGS["data_range"],
252
+ cmap=DEFAULT_GRAYSCALE_KWARGS["cmap"],
253
+ name: Optional[str] = None,
254
+ step: Optional[int] = None,
255
+ ) -> str:
256
+ save_path = self.get_save_path(filename)
257
+ self._save_grayscale_image(save_path, img, data_range, cmap, name, step)
258
+ return save_path
259
+
260
+ def get_image_grid_(self, imgs, align):
261
+ if isinstance(imgs[0], list):
262
+ return np.concatenate(
263
+ [self.get_image_grid_(row, align) for row in imgs], axis=0
264
+ )
265
+ cols = []
266
+ for col in imgs:
267
+ assert col["type"] in ["rgb", "uv", "grayscale"]
268
+ if col["type"] == "rgb":
269
+ rgb_kwargs = self.DEFAULT_RGB_KWARGS.copy()
270
+ rgb_kwargs.update(col["kwargs"])
271
+ cols.append(self.get_rgb_image_(col["img"], **rgb_kwargs))
272
+ elif col["type"] == "uv":
273
+ uv_kwargs = self.DEFAULT_UV_KWARGS.copy()
274
+ uv_kwargs.update(col["kwargs"])
275
+ cols.append(self.get_uv_image_(col["img"], **uv_kwargs))
276
+ elif col["type"] == "grayscale":
277
+ grayscale_kwargs = self.DEFAULT_GRAYSCALE_KWARGS.copy()
278
+ grayscale_kwargs.update(col["kwargs"])
279
+ cols.append(self.get_grayscale_image_(col["img"], **grayscale_kwargs))
280
+
281
+ if align == "max":
282
+ h = max([col.shape[0] for col in cols])
283
+ w = max([col.shape[1] for col in cols])
284
+ elif align == "min":
285
+ h = min([col.shape[0] for col in cols])
286
+ w = min([col.shape[1] for col in cols])
287
+ elif isinstance(align, int):
288
+ h = align
289
+ w = align
290
+ elif (
291
+ isinstance(align, tuple)
292
+ and isinstance(align[0], int)
293
+ and isinstance(align[1], int)
294
+ ):
295
+ h, w = align
296
+ else:
297
+ raise ValueError(
298
+ f"Unsupported image grid align: {align}, should be min, max, int or (int, int)"
299
+ )
300
+
301
+ for i in range(len(cols)):
302
+ if cols[i].shape[0] != h or cols[i].shape[1] != w:
303
+ cols[i] = cv2.resize(cols[i], (w, h), interpolation=cv2.INTER_LINEAR)
304
+ return np.concatenate(cols, axis=1)
305
+
306
+ def save_image_grid(
307
+ self,
308
+ filename,
309
+ imgs,
310
+ align=DEFAULT_GRID_KWARGS["align"],
311
+ name: Optional[str] = None,
312
+ step: Optional[int] = None,
313
+ texts: Optional[List[float]] = None,
314
+ ):
315
+ save_path = self.get_save_path(filename)
316
+ img = self.get_image_grid_(imgs, align=align)
317
+
318
+ if texts is not None:
319
+ img = Image.fromarray(img)
320
+ draw = ImageDraw.Draw(img)
321
+ black, white = (0, 0, 0), (255, 255, 255)
322
+ for i, text in enumerate(texts):
323
+ draw.text((2, (img.size[1] // len(texts)) * i + 1), f"{text}", white)
324
+ draw.text((0, (img.size[1] // len(texts)) * i + 1), f"{text}", white)
325
+ draw.text((2, (img.size[1] // len(texts)) * i - 1), f"{text}", white)
326
+ draw.text((0, (img.size[1] // len(texts)) * i - 1), f"{text}", white)
327
+ draw.text((1, (img.size[1] // len(texts)) * i), f"{text}", black)
328
+ img = np.asarray(img)
329
+
330
+ cv2.imwrite(save_path, img)
331
+ if name and self._wandb_logger:
332
+ wandb.log({name: wandb.Image(save_path), "trainer/global_step": step})
333
+ return save_path
334
+
335
+ def save_image(self, filename, img) -> str:
336
+ save_path = self.get_save_path(filename)
337
+ img = self.convert_data(img)
338
+ assert img.dtype == np.uint8 or img.dtype == np.uint16
339
+ if img.ndim == 3 and img.shape[-1] == 3:
340
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
341
+ elif img.ndim == 3 and img.shape[-1] == 4:
342
+ img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGRA)
343
+ cv2.imwrite(save_path, img)
344
+ return save_path
345
+
346
+ def save_cubemap(self, filename, img, data_range=(0, 1), rgba=False) -> str:
347
+ save_path = self.get_save_path(filename)
348
+ img = self.convert_data(img)
349
+ assert img.ndim == 4 and img.shape[0] == 6 and img.shape[1] == img.shape[2]
350
+
351
+ imgs_full = []
352
+ for start in range(0, img.shape[-1], 3):
353
+ img_ = img[..., start : start + 3]
354
+ img_ = np.stack(
355
+ [
356
+ self.get_rgb_image_(img_[i], "HWC", data_range, rgba=rgba)
357
+ for i in range(img_.shape[0])
358
+ ],
359
+ axis=0,
360
+ )
361
+ size = img_.shape[1]
362
+             placeholder = np.zeros((size, size, 3), dtype=np.uint8)  # keep uint8 so the concatenated grid stays a valid 8-bit image for cv2.imwrite
363
+ img_full = np.concatenate(
364
+ [
365
+ np.concatenate(
366
+ [placeholder, img_[2], placeholder, placeholder], axis=1
367
+ ),
368
+ np.concatenate([img_[1], img_[4], img_[0], img_[5]], axis=1),
369
+ np.concatenate(
370
+ [placeholder, img_[3], placeholder, placeholder], axis=1
371
+ ),
372
+ ],
373
+ axis=0,
374
+ )
375
+ imgs_full.append(img_full)
376
+
377
+ imgs_full = np.concatenate(imgs_full, axis=1)
378
+ cv2.imwrite(save_path, imgs_full)
379
+ return save_path
380
+
381
+ def save_data(self, filename, data) -> str:
382
+ data = self.convert_data(data)
383
+ if isinstance(data, dict):
384
+ if not filename.endswith(".npz"):
385
+ filename += ".npz"
386
+ save_path = self.get_save_path(filename)
387
+ np.savez(save_path, **data)
388
+ else:
389
+ if not filename.endswith(".npy"):
390
+ filename += ".npy"
391
+ save_path = self.get_save_path(filename)
392
+ np.save(save_path, data)
393
+ return save_path
394
+
395
+ def save_state_dict(self, filename, data) -> str:
396
+ save_path = self.get_save_path(filename)
397
+ torch.save(data, save_path)
398
+ return save_path
399
+
400
+ # def save_img_sequence(
401
+ # self,
402
+ # filename,
403
+ # img_dir,
404
+ # matcher,
405
+ # save_format="mp4",
406
+ # fps=30,
407
+ # name: Optional[str] = None,
408
+ # step: Optional[int] = None,
409
+ # ) -> str:
410
+ # assert save_format in ["gif", "mp4"]
411
+ # if not filename.endswith(save_format):
412
+ # filename += f".{save_format}"
413
+ # save_path = self.get_save_path(filename)
414
+ # matcher = re.compile(matcher)
415
+ # img_dir = os.path.join(self.get_save_dir(), img_dir)
416
+ # imgs = []
417
+ # for f in os.listdir(img_dir):
418
+ # if matcher.search(f):
419
+ # imgs.append(f)
420
+ # imgs = sorted(imgs, key=lambda f: int(matcher.search(f).groups()[0]))
421
+ # imgs = [cv2.imread(os.path.join(img_dir, f)) for f in imgs]
422
+
423
+ # if save_format == "gif":
424
+ # imgs = [cv2.cvtColor(i, cv2.COLOR_BGR2RGB) for i in imgs]
425
+ # imageio.mimsave(save_path, imgs, fps=fps, palettesize=256)
426
+ # elif save_format == "mp4":
427
+ # imgs = [cv2.cvtColor(i, cv2.COLOR_BGR2RGB) for i in imgs]
428
+ # imageio.mimsave(save_path, imgs, fps=fps)
429
+ # if name and self._wandb_logger:
430
+ # wandb.log(
431
+ # {
432
+ # name: wandb.Video(save_path, format="mp4"),
433
+ # "trainer/global_step": step,
434
+ # }
435
+ # )
436
+ # return save_path
437
+
438
+ def save_img_sequence(
439
+ self,
440
+ filename,
441
+ img_dir,
442
+ matcher,
443
+ save_format="mp4",
444
+ fps=30,
445
+ name: Optional[str] = None,
446
+ step: Optional[int] = None,
447
+ multithreaded: bool = False
448
+ ) -> str:
449
+ assert save_format in ["gif", "mp4"]
450
+ if not filename.endswith(save_format):
451
+ filename += f".{save_format}"
452
+ save_path = self.get_save_path(filename)
453
+ matcher = re.compile(matcher)
454
+ img_dir = os.path.join(self.get_save_dir(), img_dir)
455
+ imgs = []
456
+ for f in os.listdir(img_dir):
457
+ if matcher.search(f):
458
+ imgs.append(f)
459
+ imgs = sorted(imgs, key=lambda f: int(matcher.search(f).groups()[0]))
460
+ imgs = [cv2.imread(os.path.join(img_dir, f)) for f in imgs]
461
+
462
+ if save_format == "gif":
463
+ imgs = [cv2.cvtColor(i, cv2.COLOR_BGR2RGB) for i in imgs]
464
+ if multithreaded:
465
+ # threestudio.info("Multithreaded gif saving: {}".format(save_path))
466
+ thread = threading.Thread(target=imageio.mimsave, args=(save_path, imgs), kwargs={"fps": fps})
467
+ thread.start()
468
+ else:
469
+ imageio.mimsave(save_path, imgs, fps=fps, palettesize=256)
470
+ elif save_format == "mp4":
471
+ imgs = [cv2.cvtColor(i, cv2.COLOR_BGR2RGB) for i in imgs]
472
+ if multithreaded:
473
+ # threestudio.info("Multithreaded mp4 saving: {}".format(save_path))
474
+ thread = threading.Thread(target=imageio.mimsave, args=(save_path, imgs), kwargs={"fps": fps})
475
+ thread.start()
476
+ else:
477
+ imageio.mimsave(save_path, imgs, fps=fps)
478
+ if name and self._wandb_logger:
479
+ wandb.log(
480
+ {
481
+ name: wandb.Video(save_path, format="mp4"),
482
+ "trainer/global_step": step,
483
+ }
484
+ )
485
+ return save_path
486
+
487
+ def save_mesh(self, filename, v_pos, t_pos_idx, v_tex=None, t_tex_idx=None) -> str:
488
+ save_path = self.get_save_path(filename)
489
+ v_pos = self.convert_data(v_pos)
490
+ t_pos_idx = self.convert_data(t_pos_idx)
491
+ mesh = trimesh.Trimesh(vertices=v_pos, faces=t_pos_idx)
492
+ mesh.export(save_path)
493
+ return save_path
494
+
495
+ def save_obj(
496
+ self,
497
+ filename: str,
498
+ mesh: Mesh,
499
+ save_mat: bool = False,
500
+ save_normal: bool = False,
501
+ save_uv: bool = False,
502
+ save_vertex_color: bool = False,
503
+ map_Kd: Optional[Float[Tensor, "H W 3"]] = None,
504
+ map_Ks: Optional[Float[Tensor, "H W 3"]] = None,
505
+ map_Bump: Optional[Float[Tensor, "H W 3"]] = None,
506
+ map_Pm: Optional[Float[Tensor, "H W 1"]] = None,
507
+ map_Pr: Optional[Float[Tensor, "H W 1"]] = None,
508
+ map_format: str = "jpg",
509
+ ) -> List[str]:
510
+
511
+ if not filename.endswith(".obj"):
512
+ filename += ".obj"
513
+ save_path = self.get_save_path(filename)
514
+ v_pos, t_pos_idx = self.convert_data(mesh.v_pos), self.convert_data(
515
+ mesh.t_pos_idx
516
+ )
517
+ v_nrm, v_tex, t_tex_idx, v_rgb = None, None, None, None
518
+ if save_normal:
519
+ v_nrm = self.convert_data(mesh.v_nrm)
520
+ if save_uv:
521
+ v_tex, t_tex_idx = self.convert_data(mesh.v_tex), self.convert_data(
522
+ mesh.t_tex_idx
523
+ )
524
+ if save_vertex_color:
525
+ v_rgb = self.convert_data(mesh.v_rgb)
526
+
527
+         # use trimesh to export the obj; note that save_mat and the map_* texture arguments are not written in this path
528
+ mesh = trimesh.Trimesh(
529
+ vertices=v_pos,
530
+ faces=t_pos_idx,
531
+ vertex_normals=v_nrm,
532
+ vertex_colors=v_rgb,
533
+ visual=trimesh.visual.TextureVisuals(
534
+ uv=v_tex,
535
+ face_uv=t_tex_idx
536
+ ) if save_uv else None
537
+ )
538
+
539
+ # save the mesh to obj
540
+ mesh.export(save_path)
541
+ return [save_path]
542
+
543
+ # def save_obj(
544
+ # self,
545
+ # filename: str,
546
+ # mesh: Mesh,
547
+ # save_mat: bool = False,
548
+ # save_normal: bool = False,
549
+ # save_uv: bool = False,
550
+ # save_vertex_color: bool = False,
551
+ # map_Kd: Optional[Float[Tensor, "H W 3"]] = None,
552
+ # map_Ks: Optional[Float[Tensor, "H W 3"]] = None,
553
+ # map_Bump: Optional[Float[Tensor, "H W 3"]] = None,
554
+ # map_Pm: Optional[Float[Tensor, "H W 1"]] = None,
555
+ # map_Pr: Optional[Float[Tensor, "H W 1"]] = None,
556
+ # map_format: str = "jpg",
557
+ # ) -> List[str]:
558
+ # save_paths: List[str] = []
559
+ # if not filename.endswith(".obj"):
560
+ # filename += ".obj"
561
+ # v_pos, t_pos_idx = self.convert_data(mesh.v_pos), self.convert_data(
562
+ # mesh.t_pos_idx
563
+ # )
564
+ # v_nrm, v_tex, t_tex_idx, v_rgb = None, None, None, None
565
+ # if save_normal:
566
+ # v_nrm = self.convert_data(mesh.v_nrm)
567
+ # if save_uv:
568
+ # v_tex, t_tex_idx = self.convert_data(mesh.v_tex), self.convert_data(
569
+ # mesh.t_tex_idx
570
+ # )
571
+ # if save_vertex_color:
572
+ # v_rgb = self.convert_data(mesh.v_rgb)
573
+ # matname, mtllib = None, None
574
+ # if save_mat:
575
+ # matname = "default"
576
+ # mtl_filename = filename.replace(".obj", ".mtl")
577
+ # mtllib = os.path.basename(mtl_filename)
578
+ # mtl_save_paths = self._save_mtl(
579
+ # mtl_filename,
580
+ # matname,
581
+ # map_Kd=self.convert_data(map_Kd),
582
+ # map_Ks=self.convert_data(map_Ks),
583
+ # map_Bump=self.convert_data(map_Bump),
584
+ # map_Pm=self.convert_data(map_Pm),
585
+ # map_Pr=self.convert_data(map_Pr),
586
+ # map_format=map_format,
587
+ # )
588
+ # save_paths += mtl_save_paths
589
+ # obj_save_path = self._save_obj(
590
+ # filename,
591
+ # v_pos,
592
+ # t_pos_idx,
593
+ # v_nrm=v_nrm,
594
+ # v_tex=v_tex,
595
+ # t_tex_idx=t_tex_idx,
596
+ # v_rgb=v_rgb,
597
+ # matname=matname,
598
+ # mtllib=mtllib,
599
+ # )
600
+ # save_paths.append(obj_save_path)
601
+ # return save_paths
602
+
603
+ # def _save_obj(
604
+ # self,
605
+ # filename,
606
+ # v_pos,
607
+ # t_pos_idx,
608
+ # v_nrm=None,
609
+ # v_tex=None,
610
+ # t_tex_idx=None,
611
+ # v_rgb=None,
612
+ # matname=None,
613
+ # mtllib=None,
614
+ # ) -> str:
615
+ # obj_str = ""
616
+ # if matname is not None:
617
+ # obj_str += f"mtllib {mtllib}\n"
618
+ # obj_str += f"g object\n"
619
+ # obj_str += f"usemtl {matname}\n"
620
+ # for i in range(len(v_pos)):
621
+ # obj_str += f"v {v_pos[i][0]} {v_pos[i][1]} {v_pos[i][2]}"
622
+ # if v_rgb is not None:
623
+ # obj_str += f" {v_rgb[i][0]} {v_rgb[i][1]} {v_rgb[i][2]}"
624
+ # obj_str += "\n"
625
+ # if v_nrm is not None:
626
+ # for v in v_nrm:
627
+ # obj_str += f"vn {v[0]} {v[1]} {v[2]}\n"
628
+ # if v_tex is not None:
629
+ # for v in v_tex:
630
+ # obj_str += f"vt {v[0]} {1.0 - v[1]}\n"
631
+
632
+ # for i in range(len(t_pos_idx)):
633
+ # obj_str += "f"
634
+ # for j in range(3):
635
+ # obj_str += f" {t_pos_idx[i][j] + 1}/"
636
+ # if v_tex is not None:
637
+ # obj_str += f"{t_tex_idx[i][j] + 1}"
638
+ # obj_str += "/"
639
+ # if v_nrm is not None:
640
+ # obj_str += f"{t_pos_idx[i][j] + 1}"
641
+ # obj_str += "\n"
642
+
643
+ # save_path = self.get_save_path(filename)
644
+ # with open(save_path, "w") as f:
645
+ # f.write(obj_str)
646
+ # return save_path
647
+
648
+ def _save_mtl(
649
+ self,
650
+ filename,
651
+ matname,
652
+ Ka=(0.0, 0.0, 0.0),
653
+ Kd=(1.0, 1.0, 1.0),
654
+ Ks=(0.0, 0.0, 0.0),
655
+ map_Kd=None,
656
+ map_Ks=None,
657
+ map_Bump=None,
658
+ map_Pm=None,
659
+ map_Pr=None,
660
+ map_format="jpg",
661
+ step: Optional[int] = None,
662
+ ) -> List[str]:
663
+ mtl_save_path = self.get_save_path(filename)
664
+ save_paths = [mtl_save_path]
665
+ mtl_str = f"newmtl {matname}\n"
666
+ mtl_str += f"Ka {Ka[0]} {Ka[1]} {Ka[2]}\n"
667
+ if map_Kd is not None:
668
+ map_Kd_save_path = os.path.join(
669
+ os.path.dirname(mtl_save_path), f"texture_kd.{map_format}"
670
+ )
671
+ mtl_str += f"map_Kd texture_kd.{map_format}\n"
672
+ self._save_rgb_image(
673
+ map_Kd_save_path,
674
+ map_Kd,
675
+ data_format="HWC",
676
+ data_range=(0, 1),
677
+ name=f"{matname}_Kd",
678
+ step=step,
679
+ )
680
+ save_paths.append(map_Kd_save_path)
681
+ else:
682
+ mtl_str += f"Kd {Kd[0]} {Kd[1]} {Kd[2]}\n"
683
+ if map_Ks is not None:
684
+ map_Ks_save_path = os.path.join(
685
+ os.path.dirname(mtl_save_path), f"texture_ks.{map_format}"
686
+ )
687
+ mtl_str += f"map_Ks texture_ks.{map_format}\n"
688
+ self._save_rgb_image(
689
+ map_Ks_save_path,
690
+ map_Ks,
691
+ data_format="HWC",
692
+ data_range=(0, 1),
693
+ name=f"{matname}_Ks",
694
+ step=step,
695
+ )
696
+ save_paths.append(map_Ks_save_path)
697
+ else:
698
+ mtl_str += f"Ks {Ks[0]} {Ks[1]} {Ks[2]}\n"
699
+ if map_Bump is not None:
700
+ map_Bump_save_path = os.path.join(
701
+ os.path.dirname(mtl_save_path), f"texture_nrm.{map_format}"
702
+ )
703
+ mtl_str += f"map_Bump texture_nrm.{map_format}\n"
704
+ self._save_rgb_image(
705
+ map_Bump_save_path,
706
+ map_Bump,
707
+ data_format="HWC",
708
+ data_range=(0, 1),
709
+ name=f"{matname}_Bump",
710
+ step=step,
711
+ )
712
+ save_paths.append(map_Bump_save_path)
713
+ if map_Pm is not None:
714
+ map_Pm_save_path = os.path.join(
715
+ os.path.dirname(mtl_save_path), f"texture_metallic.{map_format}"
716
+ )
717
+ mtl_str += f"map_Pm texture_metallic.{map_format}\n"
718
+ self._save_grayscale_image(
719
+ map_Pm_save_path,
720
+ map_Pm,
721
+ data_range=(0, 1),
722
+ cmap=None,
723
+ name=f"{matname}_refl",
724
+ step=step,
725
+ )
726
+ save_paths.append(map_Pm_save_path)
727
+ if map_Pr is not None:
728
+ map_Pr_save_path = os.path.join(
729
+ os.path.dirname(mtl_save_path), f"texture_roughness.{map_format}"
730
+ )
731
+ mtl_str += f"map_Pr texture_roughness.{map_format}\n"
732
+ self._save_grayscale_image(
733
+ map_Pr_save_path,
734
+ map_Pr,
735
+ data_range=(0, 1),
736
+ cmap=None,
737
+ name=f"{matname}_Ns",
738
+ step=step,
739
+ )
740
+ save_paths.append(map_Pr_save_path)
741
+ with open(self.get_save_path(filename), "w") as f:
742
+ f.write(mtl_str)
743
+ return save_paths
744
+
745
+ def save_file(self, filename, src_path) -> str:
746
+ save_path = self.get_save_path(filename)
747
+ shutil.copyfile(src_path, save_path)
748
+ return save_path
749
+
750
+ def save_json(self, filename, payload) -> str:
751
+ save_path = self.get_save_path(filename)
752
+ with open(save_path, "w") as f:
753
+ f.write(json.dumps(payload))
754
+ return save_path
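
For context, a minimal usage sketch of the SaverMixin defined in this file. The module path triplane_turbo.utils.saving, the DummyExporter class, and the output file names are assumptions for illustration only, not part of this commit:

    # Hypothetical usage sketch of SaverMixin (import path is an assumption).
    import numpy as np
    from triplane_turbo.utils.saving import SaverMixin  # assumed module path

    class DummyExporter(SaverMixin):
        """Any class mixing in SaverMixin only needs a save dir to start saving."""
        pass

    exporter = DummyExporter()
    exporter.set_save_dir("outputs/debug")  # every save_* call joins its filename onto this dir

    # Save a random HWC float image in [0, 1]; returns the full save path.
    rgb = np.random.rand(64, 64, 3).astype(np.float32)
    print(exporter.save_rgb_image("it0-rgb.png", rgb, data_format="HWC", data_range=(0, 1)))

    # Save a one-row grid: an RGB panel next to a jet-colormapped grayscale panel.
    depth = np.random.rand(64, 64).astype(np.float32)
    exporter.save_image_grid(
        "it0-grid.png",
        [[
            {"type": "rgb", "img": rgb, "kwargs": {}},
            {"type": "grayscale", "img": depth, "kwargs": {"cmap": "jet"}},
        ]],
    )

Since no wandb logger is created here, the optional name/step logging branches are simply skipped.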