3d

Runtime error

App Files Files Community

jiawei011 committed on Oct 11, 2023

Commit

12b7f59

1 Parent(s): 5f58ec6

init

Browse files

Files changed (40) hide show

.gitattributes +1 -0
.gitignore +20 -0
LICENSE +21 -0
LICENSE_GAUSSIAN_SPLATTING.md +83 -0
README.md +0 -13
app.py +105 -0
cam_utils.py +146 -0
configs/image.yaml +69 -0
configs/text.yaml +68 -0
data/anya_rgba.png +3 -0
data/catstatue_rgba.png +3 -0
data/csm_luigi_rgba.png +3 -0
data/test.png +3 -0
data/zelda_rgba.png +3 -0
grid_put.py +300 -0
gs_renderer.py +820 -0
guidance/sd_utils.py +334 -0
guidance/zero123_utils.py +226 -0
main.py +882 -0
main2.py +671 -0
mesh.py +622 -0
mesh_renderer.py +154 -0
mesh_utils.py +147 -0
process.py +92 -0
readme.md +139 -0
requirements.txt +37 -0
scripts/convert_obj_to_video.py +20 -0
scripts/run.sh +5 -0
scripts/run_sd.sh +31 -0
scripts/runall.py +48 -0
scripts/runall_sd.py +45 -0
sh_utils.py +118 -0
simple-knn/ext.cpp +17 -0
simple-knn/setup.py +35 -0
simple-knn/simple_knn.cu +221 -0
simple-knn/simple_knn.h +21 -0
simple-knn/simple_knn/.gitkeep +0 -0
simple-knn/spatial.cu +26 -0
simple-knn/spatial.h +14 -0
zero123.py +666 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,20 @@

+__pycache__/
+build/
+*.egg-info/
+*.so
+venv_*/
+.vs/
+.vscode/
+.idea/
+tmp_*
+data?
+data??
+scripts2
+model_cache
+logs
+videos
+images
+*.mp4

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 dreamgaussian
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

LICENSE_GAUSSIAN_SPLATTING.md ADDED Viewed

	@@ -0,0 +1,83 @@

+Gaussian-Splatting License
+===========================
+**Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**.
+The *Software* is in the process of being registered with the Agence pour la Protection des
+Programmes (APP).
+The *Software* is still being developed by the *Licensor*.
+*Licensor*'s goal is to allow the research community to use, test and evaluate
+the *Software*.
+## 1.  Definitions
+*Licensee* means any person or entity that uses the *Software* and distributes
+its *Work*.
+*Licensor* means the owners of the *Software*, i.e Inria and MPII
+*Software* means the original work of authorship made available under this
+License ie gaussian-splatting.
+*Work* means the *Software* and any additions to or derivative works of the
+*Software* that are made available under this License.
+## 2.  Purpose
+This license is intended to define the rights granted to the *Licensee* by
+Licensors under the *Software*.
+## 3.  Rights granted
+For the above reasons Licensors have decided to distribute the *Software*.
+Licensors grant non-exclusive rights to use the *Software* for research purposes
+to research users (both academic and industrial), free of charge, without right
+to sublicense.. The *Software* may be used "non-commercially", i.e., for research
+and/or evaluation purposes only.
+Subject to the terms and conditions of this License, you are granted a
+non-exclusive, royalty-free, license to reproduce, prepare derivative works of,
+publicly display, publicly perform and distribute its *Work* and any resulting
+derivative works in any form.
+## 4.  Limitations
+**4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do
+so under this License, (b) you include a complete copy of this License with
+your distribution, and (c) you retain without modification any copyright,
+patent, trademark, or attribution notices that are present in the *Work*.
+**4.2 Derivative Works.** You may specify that additional or different terms apply
+to the use, reproduction, and distribution of your derivative works of the *Work*
+("Your Terms") only if (a) Your Terms provide that the use limitation in
+Section 2 applies to your derivative works, and (b) you identify the specific
+derivative works that are subject to Your Terms. Notwithstanding Your Terms,
+this License (including the redistribution requirements in Section 3.1) will
+continue to apply to the *Work* itself.
+**4.3** Any other use without of prior consent of Licensors is prohibited. Research
+users explicitly acknowledge having received from Licensors all information
+allowing to appreciate the adequacy between of the *Software* and their needs and
+to undertake all necessary precautions for its execution and use.
+**4.4** The *Software* is provided both as a compiled library file and as source
+code. In case of using the *Software* for a publication or other results obtained
+through the use of the *Software*, users are strongly encouraged to cite the
+corresponding publications as explained in the documentation of the *Software*.
+## 5.  Disclaimer
+THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES
+WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY
+UNAUTHORIZED USE: [email protected] . ANY SUCH ACTION WILL
+CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES
+OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL
+USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR
+ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE
+AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*.

README.md DELETED Viewed

@@ -1,13 +0,0 @@
----
-title: Dreamgaussian
-emoji: 🌍
-colorFrom: red
-colorTo: purple
-sdk: gradio
-sdk_version: 3.47.1
-app_file: app.py
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import gradio as gr
+import os
+from PIL import Image
+import subprocess
+# check if there is a picture uploaded or selected
+def check_img_input(control_image):
+    """Validate that an input image was provided before running inference.
+    Returns None when ``control_image`` is set; raises ``gr.Error`` with a
+    user-facing message when it is ``None``.
+    """
+    if control_image is not None:
+        return
+    raise gr.Error("Please select or upload an input image")
+def optimize_stage_1(image_block: Image.Image, preprocess_chk: bool, elevation_slider: float):
+    """Run stage 1 (main.py) on the uploaded image and return the mesh path.
+    Args:
+        image_block: input image; it is written under ``tmp_data/``.
+        preprocess_chk: when true the image is saved as ``tmp_data/tmp.png`` and
+            ``process.py`` is run on it; otherwise the image is saved directly
+            as ``tmp_data/tmp_rgba.png``.
+        elevation_slider: value forwarded to main.py as ``elevation=...``.
+    Returns:
+        The fixed output path ``'logs/tmp_mesh.glb'``.
+    Raises:
+        gr.Error: if process.py or main.py exits with a non-zero status, so a
+            stale or missing mesh is never handed to the viewer.
+    """
+    os.makedirs('tmp_data', exist_ok=True)
+    commands = []
+    if preprocess_chk:
+        # save image to a designated path, then preprocess it
+        # NOTE(review): process.py is expected to produce tmp_data/tmp_rgba.png,
+        # which main.py reads below -- confirm against process.py.
+        image_block.save('tmp_data/tmp.png')
+        commands.append(['python', 'process.py', 'tmp_data/tmp.png'])
+    else:
+        image_block.save('tmp_data/tmp_rgba.png')
+    # stage 1
+    commands.append([
+        'python', 'main.py', '--config', 'configs/image.yaml',
+        'input=tmp_data/tmp_rgba.png', 'save_path=tmp', 'mesh_format=glb',
+        f'elevation={elevation_slider}', 'force_cuda_rast=True',
+    ])
+    for command in commands:
+        try:
+            # argv list + default shell=False: no shell parsing of the arguments
+            subprocess.run(command, check=True)
+        except subprocess.CalledProcessError as err:
+            raise gr.Error(f'{command[1]} failed with exit code {err.returncode}') from err
+    return 'logs/tmp_mesh.glb'
+def optimize_stage_2(elevation_slider: float):
+    """Run stage 2 (main2.py) and return the path of the final mesh.
+    Args:
+        elevation_slider: value forwarded to main2.py as ``elevation=...``.
+    Returns:
+        The fixed output path ``'logs/tmp.glb'``.
+    Raises:
+        gr.Error: if main2.py exits with a non-zero status, so a stale or
+            missing mesh is never handed to the viewer.
+    """
+    # stage 2
+    command = [
+        'python', 'main2.py', '--config', 'configs/image.yaml',
+        'input=tmp_data/tmp_rgba.png', 'save_path=tmp', 'mesh_format=glb',
+        f'elevation={elevation_slider}', 'force_cuda_rast=True',
+    ]
+    try:
+        # argv list + default shell=False: no shell parsing of the arguments
+        subprocess.run(command, check=True)
+    except subprocess.CalledProcessError as err:
+        raise gr.Error(f'main2.py failed with exit code {err.returncode}') from err
+    return 'logs/tmp.glb'
+if __name__ == "__main__":
+    _TITLE = '''DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content Creation'''
+    _DESCRIPTION = '''
+    <div>
+    <a style="display:inline-block" href="https://dreamgaussian.github.io"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
+    <a style="display:inline-block; margin-left: .5em" href="https://arxiv.org/abs/2309.16653"><img src="https://img.shields.io/badge/2306.16928-f9f7f7?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADcAAABMCAYAAADJPi9EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAuIwAALiMBeKU/dgAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAAa2SURBVHja3Zt7bBRFGMAXUCDGF4rY7m7bAwuhlggKStFgLBgFEkCIIRJEEoOBYHwRFYKilUgEReVNJEGCJJpehHI3M9vZvd3bUP1DjNhEIRQQsQgSHiJgQZ5dv7krWEvvdmZ7d7vHJN+ft/f99pv5XvOtJMFCqvoCUpTdIEeRLC+L9Ox5i3Q9LACaCeK0kXoSChVcD3C/tQPHpAEsquQ73IkUcEz2kcLCknyGW5MGjkljRFVL8xJOKyi4CwCOuQAeAkfTP1+tNxLkogvgEbDgffkJqKqvuMA5ifOpqg/5qWecRstNg7xoUTI1Fovdxg8oy2s5AP8CGeYHmGngeZaOL4I4LXLcpHg4149/GDz4xqgsb+UAbMKKUpkrqHA43MUyyJpWUK0EHeG2YKRXr7tB+QMcgGewLD+ebTDbtrtbBt7UPlhS4rV4IvcDI7J8P1OeA/AcAI7LHljN7aB8XTowJmZt9EFRD/o0SDMH4HlwMhMyDWZZSAHFf3YDs3RS49WDLuaAY3IJq+qzmQKLxXAZKN7oDoYbdV3v5elPqiSpMyiOuAEVZVqHXb1OhloUH+MA+ztO0cAO/RkrfyBE7OAEbAZvO8vzVtTRWFD6DAfY5biBM3PWiaL0a4lvXICwnV8WjmE6ntYmhqX2jjp5LbMZjCw/wbYeN6CizOa2GMVzQOlmHjB4Ceuyk6LJ8huccEmR5Xddg7OOV/NAtchW+E3XbOag60QA4Qwuarca0bRuEJyr+cFQwzcY98huxhAKdQelt4kAQpj4qJ3gvFXAYn+aJumXk1yPlpQUgtIHhbYoFMUstNRRWgjnpl4A7IKlayNymqFHFaWCpV9CFry3LGxR1CgA5kB5M8OX2goApwpaz6mdOMGxtAgXWJySxb4WuQD4qTDgU+N5AAnzpr7ChSWpCyisiQJqY0Y7FtmSKpbV23b45kC0KHBxcQ9QeI8w4KgnHRPVtIU7rOtbioLVg5Hl/qDwSVFAMqLSMSObroCdZYlzIJtMRFVHCaRo/wFWPgaAXzdbBpkc2A4aKzCNd97+URQuESYGDDhIVfWOQIKZJu4D2+oXlgDTV1865gUQZDts756BArMNMoR1oa46BYqbyPixZz1ZUFV3sgwoGBajuBKATl3btIn8QYYMuezRgrsiRUWyr2BxA40EkPMpA/Hm6gbUu7fjEXA3azP6AsbKD9bxdUuhjM9W7fII52BF+daRpE4+WA3P501+jbfmHvQKyFqMuXf7Ot4mkN2fr50y+bRH61X7AXdUpHSxaPQ4GVbR5AGw3g+434XgQGKfr72I+vQRhfsu92dOx7WicInzt3CBg1RVpMm0NveWo2SqFzgmdNZMbriILD+S+zoueWf2vSdAipzacWN5nMl6XxNlUHa/J8DoJodUDE0HR8Ll5V0lPxcrLEHZPV4AzS83OLis7FowVa3RSku7BSNxJqQAlN3hBTC2apmDSkpaw22wJemGQFUG7J4MlP3JC6A+f96V7vRyX9It3nzT/GrjIU8edM7rMSnIi10f476lzbE1K7yEiEuWro0OJBguLCwDuFOJc1Na6sRWL/cCeMIwUN9ggSVbe3v/5/EgzTKWLvEAiBrYRUkgwNI2ZaFQNT75UDxEUEx97zYnzpmiLEmbaYCbNxYtFAb0/Z4AztgUrhyxuNgxPnhfHFDH
z/vTgFWUQZxTRkkJhQ6YNdVUEPAfO6ZV5BRss6LcCVb7VaAma9giy0XJZBt9IQh42NY0NSdgbLIPlLUF6rEdrdt0CUCK1wsCbkcI3ZSLc7ZSwGLbmJXbPsNxnE5xilYKAobZ77LpGZ8TAIun+/iCKQoF71IxQDI3K2CCd+ARNvXg9sykBcnHAoCZG4u66hlDoQLe6QV4CRtFSxZQ+D0BwNO2jgdkzoGoah1nj3FVlSR19taTSYxI8QLut23U8dsgzqHulJNCQpcqBnpTALCuQ6NSYLHpmR5i42gZzuIdcrMMvMJbQlxe3jXxyZnLACl7ARm/FjPIDOY8ODtpM71sxwfcZpvBeUzKWmfNINM5AS+wO0Khh7dMqKccu4+qatarZjYAwDlgetzStHtEt+XedsBOQtU9XMrRgjg4KTnc5nr+dmqadit/4C4uLm8DuA9koJTj1TL7fI5nDL+qqoo/FLGAzL7dYT17PzvAcQONYSUQRxW/QMrHZVIyik0ZuQA2mzp+Ji8BW4YM3Mbzm9inaHkJCGfrUZZjujiYailfFwA8DHIy3acwUj4v9vUVa+SmgNsl5fuyDTKovW9/IAmfLV0Pi2UncA515kjYdrwC9i9rpuHiq3JwtAAAAABJRU5ErkJggg=="></a>
+    <a style="display:inline-block; margin-left: .5em" href='https://github.com/dreamgaussian/dreamgaussian'><img src='https://img.shields.io/github/stars/dreamgaussian/dreamgaussian?style=social'/></a>
+    </div>
+    We present DreamGaussian, a 3D content generation framework that significantly improves the efficiency of 3D content creation.
+    '''
+    # Hint shown under the "Generate 3D" button.
+    _IMG_USER_GUIDE = "Please upload an image in the block above (or choose an example above) and click **Generate 3D**."
+    # load images in 'data' folder as examples
+    example_folder = os.path.join(os.path.dirname(__file__), 'data')
+    example_fns = os.listdir(example_folder)
+    example_fns.sort()
+    # keep only the .png files, as full paths
+    examples_full = [os.path.join(example_folder, x) for x in example_fns if x.endswith('.png')]
+    # Compose demo layout & data flow
+    with gr.Blocks(title=_TITLE, theme=gr.themes.Soft()) as demo:
+        # header: title followed by the HTML/markdown description
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown('# ' + _TITLE)
+        gr.Markdown(_DESCRIPTION)
+        # Image-to-3D
+        with gr.Row(variant='panel'):
+            # left column: inputs (image, elevation, preprocess toggle, examples, run button)
+            with gr.Column(scale=5):
+                image_block = gr.Image(type='pil', image_mode='RGBA', height=290, label='Input image', tool=None)
+                elevation_slider = gr.Slider(-90, 90, value=0, step=1, label='Estimated elevation angle')
+                gr.Markdown(
+                    "default to 0 (horizontal), range from [-90, 90]. If you upload a look-down image, try a value like -30")
+                preprocess_chk = gr.Checkbox(True,
+                                             label='Preprocess image automatically (remove background and recenter object)')
+                gr.Examples(
+                    examples=examples_full,  # NOTE: elements must match inputs list!
+                    inputs=[image_block],
+                    outputs=[image_block],
+                    cache_examples=False,
+                    label='Examples (click one of the images below to start)',
+                    examples_per_page=40
+                )
+                img_run_btn = gr.Button("Generate 3D")
+                img_guide_text = gr.Markdown(_IMG_USER_GUIDE, visible=True)
+            # right column: one 3D viewer per optimisation stage
+            with gr.Column(scale=5):
+                obj3d_stage1 = gr.Model3D(clear_color=[0.0, 0.0, 0.0, 0.0], label="3D Model (Stage 1)")
+                obj3d = gr.Model3D(clear_color=[0.0, 0.0, 0.0, 0.0], label="3D Model (Final)")
+            # if there is an input image, continue with inference
+            # else display an error message
+            # chain: check_img_input -> optimize_stage_1 (fills obj3d_stage1) -> optimize_stage_2 (fills obj3d)
+            img_run_btn.click(check_img_input, inputs=[image_block], queue=False).success(optimize_stage_1,
+                                                                                          inputs=[image_block,
+                                                                                                  preprocess_chk,
+                                                                                                  elevation_slider],
+                                                                                          outputs=[
+                                                                                              obj3d_stage1]).success(
+                optimize_stage_2, inputs=[elevation_slider], outputs=[obj3d])
+    # enable the request queue and launch with share=True
+    demo.queue().launch(share=True)

cam_utils.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import numpy as np
+from scipy.spatial.transform import Rotation as R
+import torch
+def dot(x, y):
+    """Sum of ``x * y`` over the last axis, keeping that axis with size 1.
+    NumPy is used when ``x`` is an ``np.ndarray``; otherwise the torch
+    reduction is used.
+    """
+    if not isinstance(x, np.ndarray):
+        return torch.sum(x * y, -1, keepdim=True)
+    return np.sum(x * y, -1, keepdims=True)
+def length(x, eps=1e-20):
+    """Euclidean norm over the last axis (axis kept with size 1).
+    The squared norm is floored at ``eps`` before the square root, so the
+    result is never smaller than ``sqrt(eps)``.
+    """
+    if isinstance(x, np.ndarray):
+        squared = np.sum(x * x, axis=-1, keepdims=True)
+        return np.sqrt(np.maximum(squared, eps))
+    squared = torch.sum(x * x, -1, keepdim=True)
+    return torch.sqrt(torch.clamp(squared, min=eps))
+def safe_normalize(x, eps=1e-20):
+    """Divide ``x`` by its eps-floored length along the last axis."""
+    norm = length(x, eps)
+    return x / norm
+def look_at(campos, target, opengl=True):
+    """Build the camera rotation whose columns are (right, up, forward).
+    Args:
+        campos: [3] or [N, 3] array, camera/eye position.
+        target: [3] or [N, 3] array, point to look at.
+        opengl: if True the forward axis points from target to camera;
+            otherwise it points from camera to target.
+    Returns:
+        [3, 3] or [N, 3, 3] array.
+    """
+    world_up = np.array([0, 1, 0], dtype=np.float32)
+    if not opengl:
+        # forward is camera --> target
+        forward_vector = safe_normalize(target - campos)
+        right_vector = safe_normalize(np.cross(forward_vector, world_up))
+        up_vector = safe_normalize(np.cross(right_vector, forward_vector))
+    else:
+        # forward is target --> camera
+        forward_vector = safe_normalize(campos - target)
+        right_vector = safe_normalize(np.cross(world_up, forward_vector))
+        up_vector = safe_normalize(np.cross(forward_vector, right_vector))
+    # Stack along the LAST axis so the basis vectors are the matrix columns for
+    # both [3] and [N, 3] inputs (axis=1 put them in rows for batched input).
+    rotation = np.stack([right_vector, up_vector, forward_vector], axis=-1)
+    return rotation
+# elevation & azimuth to pose (cam2world) matrix
+def orbit_camera(elevation, azimuth, radius=1, is_degree=True, target=None, opengl=True):
+    """Return the [4, 4] float32 pose of a camera orbiting ``target``.
+    Args:
+        elevation: scalar, in (-90, 90), from +y to -y is (-90, 90).
+        azimuth: scalar, in (-180, 180), from +z to +x is (0, 90).
+        radius: scalar distance from ``target``.
+        is_degree: when True both angles are converted from degrees first.
+        target: [3] point to orbit; defaults to the origin.
+        opengl: forwarded to ``look_at``.
+    """
+    if is_degree:
+        elevation, azimuth = np.deg2rad(elevation), np.deg2rad(azimuth)
+    offset = np.array([
+        radius * np.cos(elevation) * np.sin(azimuth),
+        - radius * np.sin(elevation),
+        radius * np.cos(elevation) * np.cos(azimuth),
+    ])
+    if target is None:
+        target = np.zeros([3], dtype=np.float32)
+    campos = offset + target  # [3]
+    pose = np.eye(4, dtype=np.float32)
+    pose[:3, :3] = look_at(campos, target, opengl)
+    pose[:3, 3] = campos
+    return pose
+class OrbitCamera:
+    """Orbiting camera state: a rotation, a radius and a look-at centre."""
+    def __init__(self, W, H, r=2, fovy=60, near=0.01, far=100):
+        self.W, self.H = W, H
+        self.radius = r  # camera distance from center
+        self.fovy = np.deg2rad(fovy)  # stored in radians
+        self.near, self.far = near, far
+        self.center = np.zeros(3, dtype=np.float32)  # look at this point
+        self.rot = R.from_matrix(np.eye(3))
+        self.up = np.array([0, 1, 0], dtype=np.float32)  # need to be normalized!
+    @property
+    def fovx(self):
+        """Horizontal field of view (radians) derived from fovy and W / H."""
+        half_tan = np.tan(self.fovy / 2)
+        return 2 * np.arctan(half_tan * self.W / self.H)
+    @property
+    def campos(self):
+        """Translation column of ``pose``."""
+        return self.pose[:3, 3]
+    @property
+    def pose(self):
+        """Camera-to-world matrix, [4, 4] float32."""
+        # start ``radius`` away along +z (opengl convention)
+        offset = np.eye(4, dtype=np.float32)
+        offset[2, 3] = self.radius
+        # apply the current orbit rotation
+        rotation = np.eye(4, dtype=np.float32)
+        rotation[:3, :3] = self.rot.as_matrix()
+        c2w = np.matmul(rotation, offset)
+        # shift by the (negated) centre
+        c2w[:3, 3] -= self.center
+        return c2w
+    @property
+    def view(self):
+        """World-to-camera matrix (inverse of ``pose``)."""
+        return np.linalg.inv(self.pose)
+    @property
+    def perspective(self):
+        """Perspective projection matrix, [4, 4] float32."""
+        y = np.tan(self.fovy / 2)
+        aspect = self.W / self.H
+        depth = self.far - self.near
+        proj = np.zeros((4, 4), dtype=np.float32)
+        proj[0, 0] = 1 / (y * aspect)
+        proj[1, 1] = -1 / y
+        proj[2, 2] = -(self.far + self.near) / depth
+        proj[2, 3] = -(2 * self.far * self.near) / depth
+        proj[3, 2] = -1
+        return proj
+    @property
+    def intrinsics(self):
+        """[focal, focal, W // 2, H // 2] as float32."""
+        focal = self.H / (2 * np.tan(self.fovy / 2))
+        cx, cy = self.W // 2, self.H // 2
+        return np.array([focal, focal, cx, cy], dtype=np.float32)
+    @property
+    def mvp(self):
+        """Projection times view, [4, 4]."""
+        return self.perspective @ self.view
+    def orbit(self, dx, dy):
+        """Rotate about the up axis by dx and the camera side axis by dy."""
+        side = self.rot.as_matrix()[:3, 0]
+        about_up = R.from_rotvec(self.up * np.radians(-0.05 * dx))
+        about_side = R.from_rotvec(side * np.radians(-0.05 * dy))
+        self.rot = about_up * about_side * self.rot
+    def scale(self, delta):
+        """Multiply the radius by ``1.1 ** (-delta)``."""
+        self.radius = self.radius * 1.1 ** (-delta)
+    def pan(self, dx, dy, dz=0):
+        """Move the centre in camera coordinates (careful on the sensitivity!)."""
+        rot_mat = self.rot.as_matrix()[:3, :3]
+        shift = (0.0005 * rot_mat) @ np.array([-dx, -dy, dz])
+        self.center += shift

configs/image.yaml ADDED Viewed

	@@ -0,0 +1,69 @@

+### Input
+# input rgba image path (defaults to None, can be loaded in the GUI too)
+input:
+# input text prompt (default to None, can be input in GUI too)
+prompt:
+# input mesh for stage 2 (auto-search from stage 1 output path if None)
+mesh:
+# estimated elevation angle for input image
+elevation: 0
+# reference image resolution
+ref_size: 256
+# density thresh for mesh extraction
+density_thresh: 1
+### Output
+outdir: logs
+mesh_format: obj
+save_path: ???
+### Training
+# guidance loss weights (0 to disable)
+lambda_sd: 0
+lambda_zero123: 1
+# training batch size per iter
+batch_size: 1
+# training iterations for stage 1
+iters: 500
+# training iterations for stage 2
+iters_refine: 50
+# training camera radius
+radius: 2
+# training camera fovy
+fovy: 49.1 # align with zero123 rendering setting (ref: https://github.com/cvlab-columbia/zero123/blob/main/objaverse-rendering/scripts/blender_script.py#L61)
+# checkpoint to load for stage 1 (should be a ply file)
+load:
+# whether allow geom training in stage 2
+train_geo: False
+# prob to invert background color during training (0 = always black, 1 = always white)
+invert_bg_prob: 0.5
+### GUI
+gui: False
+force_cuda_rast: False
+# GUI resolution
+H: 800
+W: 800
+### Gaussian splatting
+num_pts: 5000
+sh_degree: 0
+position_lr_init: 0.001
+position_lr_final: 0.00002
+position_lr_delay_mult: 0.02
+position_lr_max_steps: 500
+feature_lr: 0.01
+opacity_lr: 0.05
+scaling_lr: 0.005
+rotation_lr: 0.005
+percent_dense: 0.1
+density_start_iter: 100
+density_end_iter: 3000
+densification_interval: 100
+opacity_reset_interval: 700
+densify_grad_threshold: 0.5
+### Textured Mesh
+geom_lr: 0.0001
+texture_lr: 0.2

configs/text.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+### Input
+# input rgba image path (defaults to None, can be loaded in the GUI too)
+input:
+# input text prompt (default to None, can be input in GUI too)
+prompt:
+# input mesh for stage 2 (auto-search from stage 1 output path if None)
+mesh:
+# estimated elevation angle for input image
+elevation: 0
+# reference image resolution
+ref_size: 256
+# density thresh for mesh extraction
+density_thresh: 1
+### Output
+outdir: logs
+mesh_format: obj
+save_path: ???
+### Training
+# guidance loss weights (0 to disable)
+lambda_sd: 1
+lambda_zero123: 0
+# training batch size per iter
+batch_size: 1
+# training iterations for stage 1
+iters: 500
+# training iterations for stage 2
+iters_refine: 50
+# training camera radius
+radius: 2.5
+# training camera fovy
+fovy: 49.1
+# checkpoint to load for stage 1 (should be a ply file)
+load:
+# whether allow geom training in stage 2
+train_geo: False
+# prob to invert background color during training (0 = always black, 1 = always white)
+invert_bg_prob: 0.5
+### GUI
+gui: False
+force_cuda_rast: False
+# GUI resolution
+H: 800
+W: 800
+### Gaussian splatting
+num_pts: 1000
+sh_degree: 0
+position_lr_init: 0.001
+position_lr_final: 0.00002
+position_lr_delay_mult: 0.02
+position_lr_max_steps: 500
+feature_lr: 0.01
+opacity_lr: 0.05
+scaling_lr: 0.005
+rotation_lr: 0.005
+percent_dense: 0.1
+density_start_iter: 100
+density_end_iter: 3000
+densification_interval: 50
+opacity_reset_interval: 700
+densify_grad_threshold: 0.01
+### Textured Mesh
+geom_lr: 0.0001
+texture_lr: 0.2

data/anya_rgba.png ADDED Viewed

Git LFS Details

SHA256: b8c3e8fe7fb51c4ae7f8b561e3780a50f1f25a9cb8c838d7fce4b38d773473f8
Pointer size: 130 Bytes
Size of remote file: 32.9 kB

data/catstatue_rgba.png ADDED Viewed

Git LFS Details

SHA256: 6a571efb23ff05f92d7363d32a4027c08137d84e9bde863c7dfca5086bd3005d
Pointer size: 130 Bytes
Size of remote file: 45.5 kB

data/csm_luigi_rgba.png ADDED Viewed

Git LFS Details

SHA256: 538fd1c3d1be3f0ef0cbdbf60d3e77821cb304dd68e3fbd62229191d5d050186
Pointer size: 130 Bytes
Size of remote file: 35.4 kB

data/test.png ADDED Viewed

Git LFS Details

SHA256: 479f4fa9a5d2fcbf81240533f347a0d080050162757702317c8d7e06401bb958
Pointer size: 132 Bytes
Size of remote file: 1.05 MB

data/zelda_rgba.png ADDED Viewed

Git LFS Details

SHA256: b5e5004f1c64cbb9aceaf47c3594cfb89dfee64fbdf1a5a10faa5f51e87f0c4f
Pointer size: 130 Bytes
Size of remote file: 44.9 kB

grid_put.py ADDED Viewed

	@@ -0,0 +1,300 @@

+import torch
+import torch.nn.functional as F
+def stride_from_shape(shape):
+    """Return the row-major element strides for ``shape`` as a list.
+    The last stride is always 1; each earlier stride is the product of all
+    later extents. An empty shape yields ``[1]``.
+    """
+    running = 1
+    strides = [running]
+    # walk the extents from the last one down to index 1
+    for extent in shape[:0:-1]:
+        running = running * extent
+        strides.insert(0, running)
+    return strides
+def scatter_add_nd(input, indices, values):
+    """Accumulate ``values`` into ``input`` at N-dimensional integer indices.
+    Args:
+        input: [..., C], D grid dimensions + C channels.
+        indices: [N, D], long.
+        values: [N, C].
+    Returns:
+        ``input`` viewed back to its original shape.
+    """
+    num_dims = indices.shape[-1]
+    channels = input.shape[-1]
+    grid_shape = input.shape[:-1]
+    strides = stride_from_shape(grid_shape)
+    assert len(grid_shape) == num_dims
+    flat_input = input.view(-1, channels)  # [HW, C]
+    stride_t = torch.tensor(strides, dtype=torch.long, device=indices.device)
+    flat_idx = (indices * stride_t).sum(-1)  # [N]
+    flat_input.scatter_add_(0, flat_idx.unsqueeze(1).repeat(1, channels), values)
+    return flat_input.view(*grid_shape, channels)
+def scatter_add_nd_with_count(input, count, indices, values, weights=None):
+    """Accumulate ``values`` into ``input`` and ``weights`` into ``count``.
+    Args:
+        input: [..., C], D grid dimensions + C channels.
+        count: [..., 1], same D grid dimensions.
+        indices: [N, D], long.
+        values: [N, C].
+        weights: [N, 1] per-sample weight added to ``count``; ones when None.
+    Returns:
+        ``(input, count)`` viewed back to their grid shapes.
+    """
+    num_dims = indices.shape[-1]
+    channels = input.shape[-1]
+    grid_shape = input.shape[:-1]
+    strides = stride_from_shape(grid_shape)
+    assert len(grid_shape) == num_dims
+    flat_input = input.view(-1, channels)  # [HW, C]
+    flat_count = count.view(-1, 1)
+    stride_t = torch.tensor(strides, dtype=torch.long, device=indices.device)
+    flat_idx = (indices * stride_t).sum(-1).unsqueeze(1)  # [N, 1]
+    if weights is None:
+        weights = torch.ones_like(values[..., :1])
+    flat_input.scatter_add_(0, flat_idx.repeat(1, channels), values)
+    flat_count.scatter_add_(0, flat_idx, weights)
+    return flat_input.view(*grid_shape, channels), flat_count.view(*grid_shape, 1)
+def nearest_grid_put_2d(H, W, coords, values, return_count=False):
+    """Splat each value onto its nearest cell of an [H, W] grid.
+    Args:
+        coords: [N, 2], float in [-1, 1].
+        values: [N, C].
+        return_count: when True return the raw sums and the hit counts.
+    Returns:
+        [H, W, C] per-cell mean (cells without hits stay zero), or
+        ``(sum [H, W, C], count [H, W, 1])`` when ``return_count`` is set.
+    """
+    C = values.shape[-1]
+    extent = torch.tensor([H - 1, W - 1], dtype=torch.float32, device=coords.device)
+    cells = ((coords * 0.5 + 0.5) * extent).round().long()  # [N, 2]
+    total = torch.zeros(H, W, C, device=values.device, dtype=values.dtype)  # [H, W, C]
+    hits = torch.zeros(H, W, 1, device=values.device, dtype=values.dtype)  # [H, W, 1]
+    unit = torch.ones_like(values[..., :1])  # [N, 1]
+    total, hits = scatter_add_nd_with_count(total, hits, cells, values, unit)
+    if return_count:
+        return total, hits
+    filled = hits.squeeze(-1) > 0
+    total[filled] = total[filled] / hits[filled].repeat(1, C)
+    return total
+def linear_grid_put_2d(H, W, coords, values, return_count=False):
+    """Bilinearly splat each value onto the 4 surrounding cells of an [H, W] grid.
+    Args:
+        coords: [N, 2], float in [-1, 1].
+        values: [N, C].
+        return_count: when True return the weighted sums and weight totals.
+    Returns:
+        [H, W, C] weight-normalised grid (untouched cells stay zero), or
+        ``(sum [H, W, C], count [H, W, 1])`` when ``return_count`` is set.
+    """
+    C = values.shape[-1]
+    extent = torch.tensor([H - 1, W - 1], dtype=torch.float32, device=coords.device)
+    indices = (coords * 0.5 + 0.5) * extent
+    base = indices.floor().long()  # [N, 2]
+    # keep the +1 neighbours inside the grid
+    base[:, 0].clamp_(0, H - 2)
+    base[:, 1].clamp_(0, W - 2)
+    h = indices[..., 0] - base[..., 0].float()
+    w = indices[..., 1] - base[..., 1].float()
+    # (row offset, col offset, weight), in the order 00, 01, 10, 11
+    corners = (
+        (0, 0, (1 - h) * (1 - w)),
+        (0, 1, (1 - h) * w),
+        (1, 0, h * (1 - w)),
+        (1, 1, h * w),
+    )
+    result = torch.zeros(H, W, C, device=values.device, dtype=values.dtype)  # [H, W, C]
+    count = torch.zeros(H, W, 1, device=values.device, dtype=values.dtype)  # [H, W, 1]
+    weights = torch.ones_like(values[..., :1])  # [N, 1]
+    for d_row, d_col, corner_w in corners:
+        shift = torch.tensor([d_row, d_col], dtype=torch.long, device=indices.device)
+        result, count = scatter_add_nd_with_count(
+            result, count, base + shift, values * corner_w.unsqueeze(1), weights * corner_w.unsqueeze(1)
+        )
+    if return_count:
+        return result, count
+    filled = count.squeeze(-1) > 0
+    result[filled] = result[filled] / count[filled].repeat(1, C)
+    return result
+def mipmap_linear_grid_put_2d(H, W, coords, values, min_resolution=32, return_count=False):
+    """Bilinear splat with hole filling from progressively halved grids.
+    Args:
+        coords: [N, 2], float in [-1, 1].
+        values: [N, C].
+        min_resolution: stop once the smaller side of the current level is
+            no longer above this value.
+        return_count: when True return the accumulated sums and weights.
+    Returns:
+        [H, W, C] weight-normalised grid, or ``(sum, count)`` when
+        ``return_count`` is set.
+    """
+    C = values.shape[-1]
+    result = torch.zeros(H, W, C, device=values.device, dtype=values.dtype)  # [H, W, C]
+    count = torch.zeros(H, W, 1, device=values.device, dtype=values.dtype)  # [H, W, 1]
+    level_h, level_w = H, W
+    while min(level_h, level_w) > min_resolution:
+        # only cells that are still empty receive data from this level
+        holes = (count.squeeze(-1) == 0)
+        if not holes.any():
+            break
+        level_result, level_count = linear_grid_put_2d(level_h, level_w, coords, values, return_count=True)
+        # upsample this level back to the full [H, W] resolution
+        up_result = F.interpolate(level_result.permute(2,0,1).unsqueeze(0).contiguous(), (H, W), mode='bilinear', align_corners=False)
+        up_result = up_result.squeeze(0).permute(1,2,0).contiguous()
+        up_count = F.interpolate(level_count.view(1, 1, level_h, level_w), (H, W), mode='bilinear', align_corners=False).view(H, W, 1)
+        result[holes] = result[holes] + up_result[holes]
+        count[holes] = count[holes] + up_count[holes]
+        level_h //= 2
+        level_w //= 2
+    if return_count:
+        return result, count
+    filled = (count.squeeze(-1) > 0)
+    result[filled] = result[filled] / count[filled].repeat(1, C)
+    return result
+def nearest_grid_put_3d(H, W, D, coords, values, return_count=False):
+    """Splat each value onto its nearest cell of an [H, W, D] grid.
+    Args:
+        coords: [N, 3], float in [-1, 1].
+        values: [N, C].
+        return_count: when True return the raw sums and the hit counts.
+    Returns:
+        [H, W, D, C] per-cell mean (cells without hits stay zero), or
+        ``(sum [H, W, D, C], count [H, W, D, 1])`` when ``return_count`` is set.
+    """
+    C = values.shape[-1]
+    extent = torch.tensor([H - 1, W - 1, D - 1], dtype=torch.float32, device=coords.device)
+    cells = ((coords * 0.5 + 0.5) * extent).round().long()  # [N, 3]
+    total = torch.zeros(H, W, D, C, device=values.device, dtype=values.dtype)  # [H, W, D, C]
+    hits = torch.zeros(H, W, D, 1, device=values.device, dtype=values.dtype)  # [H, W, D, 1]
+    unit = torch.ones_like(values[..., :1])  # [N, 1]
+    total, hits = scatter_add_nd_with_count(total, hits, cells, values, unit)
+    if return_count:
+        return total, hits
+    filled = hits.squeeze(-1) > 0
+    total[filled] = total[filled] / hits[filled].repeat(1, C)
+    return total
+def linear_grid_put_3d(H, W, D, coords, values, return_count=False):
+    """Trilinearly splat each value onto the 8 surrounding cells of an [H, W, D] grid.
+    Args:
+        coords: [N, 3], float in [-1, 1].
+        values: [N, C].
+        return_count: when True return the weighted sums and weight totals.
+    Returns:
+        [H, W, D, C] weight-normalised grid (untouched cells stay zero), or
+        ``(sum [H, W, D, C], count [H, W, D, 1])`` when ``return_count`` is set.
+    """
+    C = values.shape[-1]
+    indices = (coords * 0.5 + 0.5) * torch.tensor(
+        [H - 1, W - 1, D - 1], dtype=torch.float32, device=coords.device
+    )
+    indices_000 = indices.floor().long()  # [N, 3]
+    # keep the +1 neighbours inside the grid
+    indices_000[:, 0].clamp_(0, H - 2)
+    indices_000[:, 1].clamp_(0, W - 2)
+    indices_000[:, 2].clamp_(0, D - 2)
+    # fractional position inside the cell along each axis
+    h = indices[..., 0] - indices_000[..., 0].float()
+    w = indices[..., 1] - indices_000[..., 1].float()
+    d = indices[..., 2] - indices_000[..., 2].float()
+    result = torch.zeros(H, W, D, C, device=values.device, dtype=values.dtype)  # [H, W, D, C]
+    count = torch.zeros(H, W, D, 1, device=values.device, dtype=values.dtype)  # [H, W, D, 1]
+    weights = torch.ones_like(values[..., :1])  # [N, 1]
+    # Visit the corners in (h, w, d) bit order: 000, 001, ..., 111. The weight
+    # of a corner is derived from the SAME offset used for its index, so a +1
+    # step along an axis always pairs with that axis' fractional part. (The
+    # hand-written table previously mixed up the axes for six of the corners.)
+    for dh in (0, 1):
+        for dw in (0, 1):
+            for dd in (0, 1):
+                offset = torch.tensor([dh, dw, dd], dtype=torch.long, device=indices.device)
+                corner_w = (
+                    (h if dh else 1 - h) * (w if dw else 1 - w) * (d if dd else 1 - d)
+                ).unsqueeze(1)  # [N, 1]
+                result, count = scatter_add_nd_with_count(
+                    result, count, indices_000 + offset, values * corner_w, weights * corner_w
+                )
+    if return_count:
+        return result, count
+    mask = (count.squeeze(-1) > 0)
+    result[mask] = result[mask] / count[mask].repeat(1, C)
+    return result
+def mipmap_linear_grid_put_3d(H, W, D, coords, values, min_resolution=32, return_count=False):
+    # coords: [N, 3], float in [-1, 1]
+    # values: [N, C]
+    C = values.shape[-1]
+    result = torch.zeros(H, W, D, C, device=values.device, dtype=values.dtype)  # [H, W, D, C]
+    count = torch.zeros(H, W, D, 1, device=values.device, dtype=values.dtype)  # [H, W, D, 1]
+    cur_H, cur_W, cur_D = H, W, D
+    while min(min(cur_H, cur_W), cur_D) > min_resolution:
+        # try to fill the holes
+        mask = (count.squeeze(-1) == 0)
+        if not mask.any():
+            break
+        cur_result, cur_count = linear_grid_put_3d(cur_H, cur_W, cur_D, coords, values, return_count=True)
+        result[mask] = result[mask] + F.interpolate(cur_result.permute(3,0,1,2).unsqueeze(0).contiguous(), (H, W, D), mode='trilinear', align_corners=False).squeeze(0).permute(1,2,3,0).contiguous()[mask]
+        count[mask] = count[mask] + F.interpolate(cur_count.view(1, 1, cur_H, cur_W, cur_D), (H, W, D), mode='trilinear', align_corners=False).view(H, W, D, 1)[mask]
+        cur_H //= 2
+        cur_W //= 2
+        cur_D //= 2
+    if return_count:
+        return result, count
+    mask = (count.squeeze(-1) > 0)
+    result[mask] = result[mask] / count[mask].repeat(1, C)
+    return result
+def grid_put(shape, coords, values, mode='linear-mipmap', min_resolution=32, return_raw=False):
+    # shape: [D], list/tuple
+    # coords: [N, D], float in [-1, 1]
+    # values: [N, C]
+    D = len(shape)
+    assert D in [2, 3], f'only support D == 2 or 3, but got D == {D}'
+    if mode == 'nearest':
+        if D == 2:
+            return nearest_grid_put_2d(*shape, coords, values, return_raw)
+        else:
+            return nearest_grid_put_3d(*shape, coords, values, return_raw)
+    elif mode == 'linear':
+        if D == 2:
+            return linear_grid_put_2d(*shape, coords, values, return_raw)
+        else:
+            return linear_grid_put_3d(*shape, coords, values, return_raw)
+    elif mode == 'linear-mipmap':
+        if D == 2:
+            return mipmap_linear_grid_put_2d(*shape, coords, values, min_resolution, return_raw)
+        else:
+            return mipmap_linear_grid_put_3d(*shape, coords, values, min_resolution, return_raw)
+    else:
+        raise NotImplementedError(f"got mode {mode}")

gs_renderer.py ADDED Viewed

	@@ -0,0 +1,820 @@

+import os
+import math
+import numpy as np
+from typing import NamedTuple
+from plyfile import PlyData, PlyElement
+import torch
+from torch import nn
+from diff_gaussian_rasterization import (
+    GaussianRasterizationSettings,
+    GaussianRasterizer,
+)
+from simple_knn._C import distCUDA2
+from sh_utils import eval_sh, SH2RGB, RGB2SH
+from mesh import Mesh
+from mesh_utils import decimate_mesh, clean_mesh
+import kiui
+def inverse_sigmoid(x):
+    return torch.log(x/(1-x))
+def get_expon_lr_func(
+    lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000
+):
+    def helper(step):
+        if lr_init == lr_final:
+            # constant lr, ignore other params
+            return lr_init
+        if step < 0 or (lr_init == 0.0 and lr_final == 0.0):
+            # Disable this parameter
+            return 0.0
+        if lr_delay_steps > 0:
+            # A kind of reverse cosine decay.
+            delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
+                0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
+            )
+        else:
+            delay_rate = 1.0
+        t = np.clip(step / max_steps, 0, 1)
+        log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
+        return delay_rate * log_lerp
+    return helper
+def strip_lowerdiag(L):
+    uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")
+    uncertainty[:, 0] = L[:, 0, 0]
+    uncertainty[:, 1] = L[:, 0, 1]
+    uncertainty[:, 2] = L[:, 0, 2]
+    uncertainty[:, 3] = L[:, 1, 1]
+    uncertainty[:, 4] = L[:, 1, 2]
+    uncertainty[:, 5] = L[:, 2, 2]
+    return uncertainty
+def strip_symmetric(sym):
+    return strip_lowerdiag(sym)
+def gaussian_3d_coeff(xyzs, covs):
+    # xyzs: [N, 3]
+    # covs: [N, 6]
+    x, y, z = xyzs[:, 0], xyzs[:, 1], xyzs[:, 2]
+    a, b, c, d, e, f = covs[:, 0], covs[:, 1], covs[:, 2], covs[:, 3], covs[:, 4], covs[:, 5]
+    # eps must be small enough !!!
+    inv_det = 1 / (a * d * f + 2 * e * c * b - e**2 * a - c**2 * d - b**2 * f + 1e-24)
+    inv_a = (d * f - e**2) * inv_det
+    inv_b = (e * c - b * f) * inv_det
+    inv_c = (e * b - c * d) * inv_det
+    inv_d = (a * f - c**2) * inv_det
+    inv_e = (b * c - e * a) * inv_det
+    inv_f = (a * d - b**2) * inv_det
+    power = -0.5 * (x**2 * inv_a + y**2 * inv_d + z**2 * inv_f) - x * y * inv_b - x * z * inv_c - y * z * inv_e
+    power[power > 0] = -1e10 # abnormal values... make weights 0
+    return torch.exp(power)
+def build_rotation(r):
+    norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])
+    q = r / norm[:, None]
+    R = torch.zeros((q.size(0), 3, 3), device='cuda')
+    r = q[:, 0]
+    x = q[:, 1]
+    y = q[:, 2]
+    z = q[:, 3]
+    R[:, 0, 0] = 1 - 2 * (y*y + z*z)
+    R[:, 0, 1] = 2 * (x*y - r*z)
+    R[:, 0, 2] = 2 * (x*z + r*y)
+    R[:, 1, 0] = 2 * (x*y + r*z)
+    R[:, 1, 1] = 1 - 2 * (x*x + z*z)
+    R[:, 1, 2] = 2 * (y*z - r*x)
+    R[:, 2, 0] = 2 * (x*z - r*y)
+    R[:, 2, 1] = 2 * (y*z + r*x)
+    R[:, 2, 2] = 1 - 2 * (x*x + y*y)
+    return R
+def build_scaling_rotation(s, r):
+    L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")
+    R = build_rotation(r)
+    L[:,0,0] = s[:,0]
+    L[:,1,1] = s[:,1]
+    L[:,2,2] = s[:,2]
+    L = R @ L
+    return L
+class BasicPointCloud(NamedTuple):
+    points: np.array
+    colors: np.array
+    normals: np.array
+class GaussianModel:
+    def setup_functions(self):
+        def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation):
+            L = build_scaling_rotation(scaling_modifier * scaling, rotation)
+            actual_covariance = L @ L.transpose(1, 2)
+            symm = strip_symmetric(actual_covariance)
+            return symm
+        self.scaling_activation = torch.exp
+        self.scaling_inverse_activation = torch.log
+        self.covariance_activation = build_covariance_from_scaling_rotation
+        self.opacity_activation = torch.sigmoid
+        self.inverse_opacity_activation = inverse_sigmoid
+        self.rotation_activation = torch.nn.functional.normalize
+    def __init__(self, sh_degree : int):
+        self.active_sh_degree = 0
+        self.max_sh_degree = sh_degree
+        self._xyz = torch.empty(0)
+        self._features_dc = torch.empty(0)
+        self._features_rest = torch.empty(0)
+        self._scaling = torch.empty(0)
+        self._rotation = torch.empty(0)
+        self._opacity = torch.empty(0)
+        self.max_radii2D = torch.empty(0)
+        self.xyz_gradient_accum = torch.empty(0)
+        self.denom = torch.empty(0)
+        self.optimizer = None
+        self.percent_dense = 0
+        self.spatial_lr_scale = 0
+        self.setup_functions()
+    def capture(self):
+        return (
+            self.active_sh_degree,
+            self._xyz,
+            self._features_dc,
+            self._features_rest,
+            self._scaling,
+            self._rotation,
+            self._opacity,
+            self.max_radii2D,
+            self.xyz_gradient_accum,
+            self.denom,
+            self.optimizer.state_dict(),
+            self.spatial_lr_scale,
+        )
+    def restore(self, model_args, training_args):
+        (self.active_sh_degree,
+        self._xyz,
+        self._features_dc,
+        self._features_rest,
+        self._scaling,
+        self._rotation,
+        self._opacity,
+        self.max_radii2D,
+        xyz_gradient_accum,
+        denom,
+        opt_dict,
+        self.spatial_lr_scale) = model_args
+        self.training_setup(training_args)
+        self.xyz_gradient_accum = xyz_gradient_accum
+        self.denom = denom
+        self.optimizer.load_state_dict(opt_dict)
+    @property
+    def get_scaling(self):
+        return self.scaling_activation(self._scaling)
+    @property
+    def get_rotation(self):
+        return self.rotation_activation(self._rotation)
+    @property
+    def get_xyz(self):
+        return self._xyz
+    @property
+    def get_features(self):
+        features_dc = self._features_dc
+        features_rest = self._features_rest
+        return torch.cat((features_dc, features_rest), dim=1)
+    @property
+    def get_opacity(self):
+        return self.opacity_activation(self._opacity)
+    @torch.no_grad()
+    def extract_fields(self, resolution=128, num_blocks=16, relax_ratio=1.5):
+        # resolution: resolution of field
+        block_size = 2 / num_blocks
+        assert resolution % block_size == 0
+        split_size = resolution // num_blocks
+        opacities = self.get_opacity
+        # pre-filter low opacity gaussians to save computation
+        mask = (opacities > 0.005).squeeze(1)
+        opacities = opacities[mask]
+        xyzs = self.get_xyz[mask]
+        stds = self.get_scaling[mask]
+        # normalize to ~ [-1, 1]
+        mn, mx = xyzs.amin(0), xyzs.amax(0)
+        self.center = (mn + mx) / 2
+        self.scale = 1.8 / (mx - mn).amax().item()
+        xyzs = (xyzs - self.center) * self.scale
+        stds = stds * self.scale
+        covs = self.covariance_activation(stds, 1, self._rotation[mask])
+        # tile
+        device = opacities.device
+        occ = torch.zeros([resolution] * 3, dtype=torch.float32, device=device)
+        X = torch.linspace(-1, 1, resolution).split(split_size)
+        Y = torch.linspace(-1, 1, resolution).split(split_size)
+        Z = torch.linspace(-1, 1, resolution).split(split_size)
+        # loop blocks (assume max size of gaussian is small than relax_ratio * block_size !!!)
+        for xi, xs in enumerate(X):
+            for yi, ys in enumerate(Y):
+                for zi, zs in enumerate(Z):
+                    xx, yy, zz = torch.meshgrid(xs, ys, zs)
+                    # sample points [M, 3]
+                    pts = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1).to(device)
+                    # in-tile gaussians mask
+                    vmin, vmax = pts.amin(0), pts.amax(0)
+                    vmin -= block_size * relax_ratio
+                    vmax += block_size * relax_ratio
+                    mask = (xyzs < vmax).all(-1) & (xyzs > vmin).all(-1)
+                    # if hit no gaussian, continue to next block
+                    if not mask.any():
+                        continue
+                    mask_xyzs = xyzs[mask] # [L, 3]
+                    mask_covs = covs[mask] # [L, 6]
+                    mask_opas = opacities[mask].view(1, -1) # [L, 1] --> [1, L]
+                    # query per point-gaussian pair.
+                    g_pts = pts.unsqueeze(1).repeat(1, mask_covs.shape[0], 1) - mask_xyzs.unsqueeze(0) # [M, L, 3]
+                    g_covs = mask_covs.unsqueeze(0).repeat(pts.shape[0], 1, 1) # [M, L, 6]
+                    # batch on gaussian to avoid OOM
+                    batch_g = 1024
+                    val = 0
+                    for start in range(0, g_covs.shape[1], batch_g):
+                        end = min(start + batch_g, g_covs.shape[1])
+                        w = gaussian_3d_coeff(g_pts[:, start:end].reshape(-1, 3), g_covs[:, start:end].reshape(-1, 6)).reshape(pts.shape[0], -1) # [M, l]
+                        val += (mask_opas[:, start:end] * w).sum(-1)
+                    # kiui.lo(val, mask_opas, w)
+                    occ[xi * split_size: xi * split_size + len(xs),
+                        yi * split_size: yi * split_size + len(ys),
+                        zi * split_size: zi * split_size + len(zs)] = val.reshape(len(xs), len(ys), len(zs))
+        kiui.lo(occ, verbose=1)
+        return occ
+    def extract_mesh(self, path, density_thresh=1, resolution=128, decimate_target=1e5):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        occ = self.extract_fields(resolution).detach().cpu().numpy()
+        import mcubes
+        vertices, triangles = mcubes.marching_cubes(occ, density_thresh)
+        vertices = vertices / (resolution - 1.0) * 2 - 1
+        # transform back to the original space
+        vertices = vertices / self.scale + self.center.detach().cpu().numpy()
+        vertices, triangles = clean_mesh(vertices, triangles, remesh=True, remesh_size=0.015)
+        if decimate_target > 0 and triangles.shape[0] > decimate_target:
+            vertices, triangles = decimate_mesh(vertices, triangles, decimate_target)
+        v = torch.from_numpy(vertices.astype(np.float32)).contiguous().cuda()
+        f = torch.from_numpy(triangles.astype(np.int32)).contiguous().cuda()
+        print(
+            f"[INFO] marching cubes result: {v.shape} ({v.min().item()}-{v.max().item()}), {f.shape}"
+        )
+        mesh = Mesh(v=v, f=f, device='cuda')
+        return mesh
+    def get_covariance(self, scaling_modifier = 1):
+        return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation)
+    def oneupSHdegree(self):
+        if self.active_sh_degree < self.max_sh_degree:
+            self.active_sh_degree += 1
+    def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float = 1):
+        self.spatial_lr_scale = spatial_lr_scale
+        fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda()
+        fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())
+        features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda()
+        features[:, :3, 0 ] = fused_color
+        features[:, 3:, 1:] = 0.0
+        print("Number of points at initialisation : ", fused_point_cloud.shape[0])
+        dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001)
+        scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3)
+        rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
+        rots[:, 0] = 1
+        opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"))
+        self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
+        self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 2).contiguous().requires_grad_(True))
+        self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True))
+        self._scaling = nn.Parameter(scales.requires_grad_(True))
+        self._rotation = nn.Parameter(rots.requires_grad_(True))
+        self._opacity = nn.Parameter(opacities.requires_grad_(True))
+        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
+    def training_setup(self, training_args):
+        self.percent_dense = training_args.percent_dense
+        self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
+        self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
+        l = [
+            {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"},
+            {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"},
+            {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"},
+            {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"},
+            {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"},
+            {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"}
+        ]
+        self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
+        self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale,
+                                                    lr_final=training_args.position_lr_final*self.spatial_lr_scale,
+                                                    lr_delay_mult=training_args.position_lr_delay_mult,
+                                                    max_steps=training_args.position_lr_max_steps)
+    def update_learning_rate(self, iteration):
+        ''' Learning rate scheduling per step '''
+        for param_group in self.optimizer.param_groups:
+            if param_group["name"] == "xyz":
+                lr = self.xyz_scheduler_args(iteration)
+                param_group['lr'] = lr
+                return lr
+    def construct_list_of_attributes(self):
+        l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
+        # All channels except the 3 DC
+        for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]):
+            l.append('f_dc_{}'.format(i))
+        for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]):
+            l.append('f_rest_{}'.format(i))
+        l.append('opacity')
+        for i in range(self._scaling.shape[1]):
+            l.append('scale_{}'.format(i))
+        for i in range(self._rotation.shape[1]):
+            l.append('rot_{}'.format(i))
+        return l
+    def save_ply(self, path):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        xyz = self._xyz.detach().cpu().numpy()
+        normals = np.zeros_like(xyz)
+        f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
+        f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
+        opacities = self._opacity.detach().cpu().numpy()
+        scale = self._scaling.detach().cpu().numpy()
+        rotation = self._rotation.detach().cpu().numpy()
+        dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()]
+        elements = np.empty(xyz.shape[0], dtype=dtype_full)
+        attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
+        elements[:] = list(map(tuple, attributes))
+        el = PlyElement.describe(elements, 'vertex')
+        PlyData([el]).write(path)
+    def reset_opacity(self):
+        opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01))
+        optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity")
+        self._opacity = optimizable_tensors["opacity"]
+    def load_ply(self, path):
+        plydata = PlyData.read(path)
+        xyz = np.stack((np.asarray(plydata.elements[0]["x"]),
+                        np.asarray(plydata.elements[0]["y"]),
+                        np.asarray(plydata.elements[0]["z"])),  axis=1)
+        opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis]
+        print("Number of points at loading : ", xyz.shape[0])
+        features_dc = np.zeros((xyz.shape[0], 3, 1))
+        features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"])
+        features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"])
+        features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"])
+        extra_f_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_")]
+        assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3
+        features_extra = np.zeros((xyz.shape[0], len(extra_f_names)))
+        for idx, attr_name in enumerate(extra_f_names):
+            features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC)
+        features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1))
+        scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")]
+        scales = np.zeros((xyz.shape[0], len(scale_names)))
+        for idx, attr_name in enumerate(scale_names):
+            scales[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")]
+        rots = np.zeros((xyz.shape[0], len(rot_names)))
+        for idx, attr_name in enumerate(rot_names):
+            rots[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True))
+        self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True))
+        self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True))
+        self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True))
+        self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True))
+        self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True))
+        self.active_sh_degree = self.max_sh_degree
+    def replace_tensor_to_optimizer(self, tensor, name):
+        optimizable_tensors = {}
+        for group in self.optimizer.param_groups:
+            if group["name"] == name:
+                stored_state = self.optimizer.state.get(group['params'][0], None)
+                stored_state["exp_avg"] = torch.zeros_like(tensor)
+                stored_state["exp_avg_sq"] = torch.zeros_like(tensor)
+                del self.optimizer.state[group['params'][0]]
+                group["params"][0] = nn.Parameter(tensor.requires_grad_(True))
+                self.optimizer.state[group['params'][0]] = stored_state
+                optimizable_tensors[group["name"]] = group["params"][0]
+        return optimizable_tensors
+    def _prune_optimizer(self, mask):
+        optimizable_tensors = {}
+        for group in self.optimizer.param_groups:
+            stored_state = self.optimizer.state.get(group['params'][0], None)
+            if stored_state is not None:
+                stored_state["exp_avg"] = stored_state["exp_avg"][mask]
+                stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask]
+                del self.optimizer.state[group['params'][0]]
+                group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True)))
+                self.optimizer.state[group['params'][0]] = stored_state
+                optimizable_tensors[group["name"]] = group["params"][0]
+            else:
+                group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True))
+                optimizable_tensors[group["name"]] = group["params"][0]
+        return optimizable_tensors
+    def prune_points(self, mask):
+        valid_points_mask = ~mask
+        optimizable_tensors = self._prune_optimizer(valid_points_mask)
+        self._xyz = optimizable_tensors["xyz"]
+        self._features_dc = optimizable_tensors["f_dc"]
+        self._features_rest = optimizable_tensors["f_rest"]
+        self._opacity = optimizable_tensors["opacity"]
+        self._scaling = optimizable_tensors["scaling"]
+        self._rotation = optimizable_tensors["rotation"]
+        self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask]
+        self.denom = self.denom[valid_points_mask]
+        self.max_radii2D = self.max_radii2D[valid_points_mask]
+    def cat_tensors_to_optimizer(self, tensors_dict):
+        optimizable_tensors = {}
+        for group in self.optimizer.param_groups:
+            assert len(group["params"]) == 1
+            extension_tensor = tensors_dict[group["name"]]
+            stored_state = self.optimizer.state.get(group['params'][0], None)
+            if stored_state is not None:
+                stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0)
+                stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0)
+                del self.optimizer.state[group['params'][0]]
+                group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
+                self.optimizer.state[group['params'][0]] = stored_state
+                optimizable_tensors[group["name"]] = group["params"][0]
+            else:
+                group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
+                optimizable_tensors[group["name"]] = group["params"][0]
+        return optimizable_tensors
+    def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation):
+        d = {"xyz": new_xyz,
+        "f_dc": new_features_dc,
+        "f_rest": new_features_rest,
+        "opacity": new_opacities,
+        "scaling" : new_scaling,
+        "rotation" : new_rotation}
+        optimizable_tensors = self.cat_tensors_to_optimizer(d)
+        self._xyz = optimizable_tensors["xyz"]
+        self._features_dc = optimizable_tensors["f_dc"]
+        self._features_rest = optimizable_tensors["f_rest"]
+        self._opacity = optimizable_tensors["opacity"]
+        self._scaling = optimizable_tensors["scaling"]
+        self._rotation = optimizable_tensors["rotation"]
+        self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
+        self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
+        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
+    def densify_and_split(self, grads, grad_threshold, scene_extent, N=2):
+        n_init_points = self.get_xyz.shape[0]
+        # Extract points that satisfy the gradient condition
+        padded_grad = torch.zeros((n_init_points), device="cuda")
+        padded_grad[:grads.shape[0]] = grads.squeeze()
+        selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False)
+        selected_pts_mask = torch.logical_and(selected_pts_mask,
+                                              torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent)
+        stds = self.get_scaling[selected_pts_mask].repeat(N,1)
+        means =torch.zeros((stds.size(0), 3),device="cuda")
+        samples = torch.normal(mean=means, std=stds)
+        rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1)
+        new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1)
+        new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N))
+        new_rotation = self._rotation[selected_pts_mask].repeat(N,1)
+        new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1)
+        new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1)
+        new_opacity = self._opacity[selected_pts_mask].repeat(N,1)
+        self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation)
+        prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool)))
+        self.prune_points(prune_filter)
+    def densify_and_clone(self, grads, grad_threshold, scene_extent):
+        # Extract points that satisfy the gradient condition
+        selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False)
+        selected_pts_mask = torch.logical_and(selected_pts_mask,
+                                              torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent)
+        new_xyz = self._xyz[selected_pts_mask]
+        new_features_dc = self._features_dc[selected_pts_mask]
+        new_features_rest = self._features_rest[selected_pts_mask]
+        new_opacities = self._opacity[selected_pts_mask]
+        new_scaling = self._scaling[selected_pts_mask]
+        new_rotation = self._rotation[selected_pts_mask]
+        self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation)
+    def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size):
+        grads = self.xyz_gradient_accum / self.denom
+        grads[grads.isnan()] = 0.0
+        self.densify_and_clone(grads, max_grad, extent)
+        self.densify_and_split(grads, max_grad, extent)
+        prune_mask = (self.get_opacity < min_opacity).squeeze()
+        if max_screen_size:
+            big_points_vs = self.max_radii2D > max_screen_size
+            big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
+            prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
+        self.prune_points(prune_mask)
+        torch.cuda.empty_cache()
+    def prune(self, min_opacity, extent, max_screen_size):
+        prune_mask = (self.get_opacity < min_opacity).squeeze()
+        if max_screen_size:
+            big_points_vs = self.max_radii2D > max_screen_size
+            big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
+            prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
+        self.prune_points(prune_mask)
+        torch.cuda.empty_cache()
+    def add_densification_stats(self, viewspace_point_tensor, update_filter):
+        self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor.grad[update_filter,:2], dim=-1, keepdim=True)
+        self.denom[update_filter] += 1
+def getProjectionMatrix(znear, zfar, fovX, fovY):
+    tanHalfFovY = math.tan((fovY / 2))
+    tanHalfFovX = math.tan((fovX / 2))
+    P = torch.zeros(4, 4)
+    z_sign = 1.0
+    P[0, 0] = 1 / tanHalfFovX
+    P[1, 1] = 1 / tanHalfFovY
+    P[3, 2] = z_sign
+    P[2, 2] = z_sign * zfar / (zfar - znear)
+    P[2, 3] = -(zfar * znear) / (zfar - znear)
+    return P
+class MiniCam:
+    def __init__(self, c2w, width, height, fovy, fovx, znear, zfar):
+        # c2w (pose) should be in NeRF convention.
+        self.image_width = width
+        self.image_height = height
+        self.FoVy = fovy
+        self.FoVx = fovx
+        self.znear = znear
+        self.zfar = zfar
+        w2c = np.linalg.inv(c2w)
+        # rectify...
+        w2c[1:3, :3] *= -1
+        w2c[:3, 3] *= -1
+        self.world_view_transform = torch.tensor(w2c).transpose(0, 1).cuda()
+        self.projection_matrix = (
+            getProjectionMatrix(
+                znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
+            )
+            .transpose(0, 1)
+            .cuda()
+        )
+        self.full_proj_transform = self.world_view_transform @ self.projection_matrix
+        self.camera_center = -torch.tensor(c2w[:3, 3]).cuda()
+class Renderer:
+    def __init__(self, sh_degree=3, white_background=True, radius=1):
+        self.sh_degree = sh_degree
+        self.white_background = white_background
+        self.radius = radius
+        self.gaussians = GaussianModel(sh_degree)
+        self.bg_color = torch.tensor(
+            [1, 1, 1] if white_background else [0, 0, 0],
+            dtype=torch.float32,
+            device="cuda",
+        )
+    def initialize(self, input=None, num_pts=5000, radius=0.5):
+        # load checkpoint
+        if input is None:
+            # init from random point cloud
+            phis = np.random.random((num_pts,)) * 2 * np.pi
+            costheta = np.random.random((num_pts,)) * 2 - 1
+            thetas = np.arccos(costheta)
+            mu = np.random.random((num_pts,))
+            radius = radius * np.cbrt(mu)
+            x = radius * np.sin(thetas) * np.cos(phis)
+            y = radius * np.sin(thetas) * np.sin(phis)
+            z = radius * np.cos(thetas)
+            xyz = np.stack((x, y, z), axis=1)
+            # xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
+            shs = np.random.random((num_pts, 3)) / 255.0
+            pcd = BasicPointCloud(
+                points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3))
+            )
+            self.gaussians.create_from_pcd(pcd, 10)
+        elif isinstance(input, BasicPointCloud):
+            # load from a provided pcd
+            self.gaussians.create_from_pcd(input, 1)
+        else:
+            # load from saved ply
+            self.gaussians.load_ply(input)
+    def render(
+        self,
+        viewpoint_camera,
+        scaling_modifier=1.0,
+        invert_bg_color=False,
+        override_color=None,
+        compute_cov3D_python=False,
+        convert_SHs_python=False,
+    ):
+        # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
+        screenspace_points = (
+            torch.zeros_like(
+                self.gaussians.get_xyz,
+                dtype=self.gaussians.get_xyz.dtype,
+                requires_grad=True,
+                device="cuda",
+            )
+            + 0
+        )
+        try:
+            screenspace_points.retain_grad()
+        except:
+            pass
+        # Set up rasterization configuration
+        tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
+        tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
+        raster_settings = GaussianRasterizationSettings(
+            image_height=int(viewpoint_camera.image_height),
+            image_width=int(viewpoint_camera.image_width),
+            tanfovx=tanfovx,
+            tanfovy=tanfovy,
+            bg=self.bg_color if not invert_bg_color else 1 - self.bg_color,
+            scale_modifier=scaling_modifier,
+            viewmatrix=viewpoint_camera.world_view_transform,
+            projmatrix=viewpoint_camera.full_proj_transform,
+            sh_degree=self.gaussians.active_sh_degree,
+            campos=viewpoint_camera.camera_center,
+            prefiltered=False,
+            debug=False,
+        )
+        rasterizer = GaussianRasterizer(raster_settings=raster_settings)
+        means3D = self.gaussians.get_xyz
+        means2D = screenspace_points
+        opacity = self.gaussians.get_opacity
+        # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
+        # scaling / rotation by the rasterizer.
+        scales = None
+        rotations = None
+        cov3D_precomp = None
+        if compute_cov3D_python:
+            cov3D_precomp = self.gaussians.get_covariance(scaling_modifier)
+        else:
+            scales = self.gaussians.get_scaling
+            rotations = self.gaussians.get_rotation
+        # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
+        # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
+        shs = None
+        colors_precomp = None
+        if colors_precomp is None:
+            if convert_SHs_python:
+                shs_view = self.gaussians.get_features.transpose(1, 2).view(
+                    -1, 3, (self.gaussians.max_sh_degree + 1) ** 2
+                )
+                dir_pp = self.gaussians.get_xyz - viewpoint_camera.camera_center.repeat(
+                    self.gaussians.get_features.shape[0], 1
+                )
+                dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True)
+                sh2rgb = eval_sh(
+                    self.gaussians.active_sh_degree, shs_view, dir_pp_normalized
+                )
+                colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
+            else:
+                shs = self.gaussians.get_features
+        else:
+            colors_precomp = override_color
+        # Rasterize visible Gaussians to image, obtain their radii (on screen).
+        rendered_image, radii, rendered_depth, rendered_alpha = rasterizer(
+            means3D=means3D,
+            means2D=means2D,
+            shs=shs,
+            colors_precomp=colors_precomp,
+            opacities=opacity,
+            scales=scales,
+            rotations=rotations,
+            cov3D_precomp=cov3D_precomp,
+        )
+        rendered_image = rendered_image.clamp(0, 1)
+        # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
+        # They will be excluded from value updates used in the splitting criteria.
+        return {
+            "image": rendered_image,
+            "depth": rendered_depth,
+            "alpha": rendered_alpha,
+            "viewspace_points": screenspace_points,
+            "visibility_filter": radii > 0,
+            "radii": radii,
+        }

guidance/sd_utils.py ADDED Viewed

	@@ -0,0 +1,334 @@

+from transformers import CLIPTextModel, CLIPTokenizer, logging
+from diffusers import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+    PNDMScheduler,
+    DDIMScheduler,
+    StableDiffusionPipeline,
+)
+from diffusers.utils.import_utils import is_xformers_available
+# suppress partial model loading warning
+logging.set_verbosity_error()
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def seed_everything(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    # torch.backends.cudnn.deterministic = True
+    # torch.backends.cudnn.benchmark = True
+class StableDiffusion(nn.Module):
+    def __init__(
+        self,
+        device,
+        fp16=True,
+        vram_O=False,
+        sd_version="2.1",
+        hf_key=None,
+        t_range=[0.02, 0.98],
+    ):
+        super().__init__()
+        self.device = device
+        self.sd_version = sd_version
+        if hf_key is not None:
+            print(f"[INFO] using hugging face custom model key: {hf_key}")
+            model_key = hf_key
+        elif self.sd_version == "2.1":
+            model_key = "stabilityai/stable-diffusion-2-1-base"
+        elif self.sd_version == "2.0":
+            model_key = "stabilityai/stable-diffusion-2-base"
+        elif self.sd_version == "1.5":
+            model_key = "runwayml/stable-diffusion-v1-5"
+        else:
+            raise ValueError(
+                f"Stable-diffusion version {self.sd_version} not supported."
+            )
+        self.dtype = torch.float16 if fp16 else torch.float32
+        # Create model
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_key, torch_dtype=self.dtype
+        )
+        if vram_O:
+            pipe.enable_sequential_cpu_offload()
+            pipe.enable_vae_slicing()
+            pipe.unet.to(memory_format=torch.channels_last)
+            pipe.enable_attention_slicing(1)
+            # pipe.enable_model_cpu_offload()
+        else:
+            pipe.to(device)
+        self.vae = pipe.vae
+        self.tokenizer = pipe.tokenizer
+        self.text_encoder = pipe.text_encoder
+        self.unet = pipe.unet
+        self.scheduler = DDIMScheduler.from_pretrained(
+            model_key, subfolder="scheduler", torch_dtype=self.dtype
+        )
+        del pipe
+        self.num_train_timesteps = self.scheduler.config.num_train_timesteps
+        self.min_step = int(self.num_train_timesteps * t_range[0])
+        self.max_step = int(self.num_train_timesteps * t_range[1])
+        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience
+        self.embeddings = None
+    @torch.no_grad()
+    def get_text_embeds(self, prompts, negative_prompts):
+        pos_embeds = self.encode_text(prompts)  # [1, 77, 768]
+        neg_embeds = self.encode_text(negative_prompts)
+        self.embeddings = torch.cat([neg_embeds, pos_embeds], dim=0)  # [2, 77, 768]
+    def encode_text(self, prompt):
+        # prompt: [str]
+        inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        embeddings = self.text_encoder(inputs.input_ids.to(self.device))[0]
+        return embeddings
+    @torch.no_grad()
+    def refine(self, pred_rgb,
+               guidance_scale=100, steps=50, strength=0.8,
+        ):
+        batch_size = pred_rgb.shape[0]
+        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
+        latents = self.encode_imgs(pred_rgb_512.to(self.dtype))
+        # latents = torch.randn((1, 4, 64, 64), device=self.device, dtype=self.dtype)
+        self.scheduler.set_timesteps(steps)
+        init_step = int(steps * strength)
+        latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step])
+        for i, t in enumerate(self.scheduler.timesteps[init_step:]):
+            latent_model_input = torch.cat([latents] * 2)
+            noise_pred = self.unet(
+                latent_model_input, t, encoder_hidden_states=self.embeddings,
+            ).sample
+            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+        imgs = self.decode_latents(latents) # [1, 3, 512, 512]
+        return imgs
+    def train_step(
+        self,
+        pred_rgb,
+        step_ratio=None,
+        guidance_scale=100,
+        as_latent=False,
+    ):
+        batch_size = pred_rgb.shape[0]
+        pred_rgb = pred_rgb.to(self.dtype)
+        if as_latent:
+            latents = F.interpolate(pred_rgb, (64, 64), mode="bilinear", align_corners=False) * 2 - 1
+        else:
+            # interp to 512x512 to be fed into vae.
+            pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode="bilinear", align_corners=False)
+            # encode image into latents with vae, requires grad!
+            latents = self.encode_imgs(pred_rgb_512)
+        if step_ratio is not None:
+            # dreamtime-like
+            # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio)
+            t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step)
+            t = torch.full((batch_size,), t, dtype=torch.long, device=self.device)
+        else:
+            t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)
+        # w(t), sigma_t^2
+        w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)
+        # predict the noise residual with unet, NO grad!
+        with torch.no_grad():
+            # add noise
+            noise = torch.randn_like(latents)
+            latents_noisy = self.scheduler.add_noise(latents, noise, t)
+            # pred noise
+            latent_model_input = torch.cat([latents_noisy] * 2)
+            tt = torch.cat([t] * 2)
+            noise_pred = self.unet(
+                latent_model_input, tt, encoder_hidden_states=self.embeddings.repeat(batch_size, 1, 1)
+            ).sample
+            # perform guidance (high scale from paper!)
+            noise_pred_uncond, noise_pred_pos = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_pos - noise_pred_uncond
+            )
+        grad = w * (noise_pred - noise)
+        grad = torch.nan_to_num(grad)
+        # seems important to avoid NaN...
+        # grad = grad.clamp(-1, 1)
+        target = (latents - grad).detach()
+        loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum') / latents.shape[0]
+        return loss
+    @torch.no_grad()
+    def produce_latents(
+        self,
+        height=512,
+        width=512,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        latents=None,
+    ):
+        if latents is None:
+            latents = torch.randn(
+                (
+                    self.embeddings.shape[0] // 2,
+                    self.unet.in_channels,
+                    height // 8,
+                    width // 8,
+                ),
+                device=self.device,
+            )
+        self.scheduler.set_timesteps(num_inference_steps)
+        for i, t in enumerate(self.scheduler.timesteps):
+            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+            latent_model_input = torch.cat([latents] * 2)
+            # predict the noise residual
+            noise_pred = self.unet(
+                latent_model_input, t, encoder_hidden_states=self.embeddings
+            ).sample
+            # perform guidance
+            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_cond - noise_pred_uncond
+            )
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+        return latents
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        imgs = self.vae.decode(latents).sample
+        imgs = (imgs / 2 + 0.5).clamp(0, 1)
+        return imgs
+    def encode_imgs(self, imgs):
+        # imgs: [B, 3, H, W]
+        imgs = 2 * imgs - 1
+        posterior = self.vae.encode(imgs).latent_dist
+        latents = posterior.sample() * self.vae.config.scaling_factor
+        return latents
+    def prompt_to_img(
+        self,
+        prompts,
+        negative_prompts="",
+        height=512,
+        width=512,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        latents=None,
+    ):
+        if isinstance(prompts, str):
+            prompts = [prompts]
+        if isinstance(negative_prompts, str):
+            negative_prompts = [negative_prompts]
+        # Prompts -> text embeds
+        self.get_text_embeds(prompts, negative_prompts)
+        # Text embeds -> img latents
+        latents = self.produce_latents(
+            height=height,
+            width=width,
+            latents=latents,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+        )  # [1, 4, 64, 64]
+        # Img latents -> imgs
+        imgs = self.decode_latents(latents)  # [1, 3, 512, 512]
+        # Img to Numpy
+        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
+        imgs = (imgs * 255).round().astype("uint8")
+        return imgs
+if __name__ == "__main__":
+    import argparse
+    import matplotlib.pyplot as plt
+    parser = argparse.ArgumentParser()
+    parser.add_argument("prompt", type=str)
+    parser.add_argument("--negative", default="", type=str)
+    parser.add_argument(
+        "--sd_version",
+        type=str,
+        default="2.1",
+        choices=["1.5", "2.0", "2.1"],
+        help="stable diffusion version",
+    )
+    parser.add_argument(
+        "--hf_key",
+        type=str,
+        default=None,
+        help="hugging face Stable diffusion model key",
+    )
+    parser.add_argument("--fp16", action="store_true", help="use float16 for training")
+    parser.add_argument(
+        "--vram_O", action="store_true", help="optimization for low VRAM usage"
+    )
+    parser.add_argument("-H", type=int, default=512)
+    parser.add_argument("-W", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--steps", type=int, default=50)
+    opt = parser.parse_args()
+    seed_everything(opt.seed)
+    device = torch.device("cuda")
+    sd = StableDiffusion(device, opt.fp16, opt.vram_O, opt.sd_version, opt.hf_key)
+    imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps)
+    # visualize image
+    plt.imshow(imgs[0])
+    plt.show()

guidance/zero123_utils.py ADDED Viewed

	@@ -0,0 +1,226 @@

+from transformers import CLIPTextModel, CLIPTokenizer, logging
+from diffusers import (
+    AutoencoderKL,
+    UNet2DConditionModel,
+    DDIMScheduler,
+    StableDiffusionPipeline,
+)
+import torchvision.transforms.functional as TF
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import sys
+sys.path.append('./')
+from zero123 import Zero123Pipeline
+class Zero123(nn.Module):
+    def __init__(self, device, fp16=True, t_range=[0.02, 0.98]):
+        super().__init__()
+        self.device = device
+        self.fp16 = fp16
+        self.dtype = torch.float16 if fp16 else torch.float32
+        self.pipe = Zero123Pipeline.from_pretrained(
+            # "bennyguo/zero123-diffusers",
+            "bennyguo/zero123-xl-diffusers",
+            # './model_cache/zero123_xl',
+            variant="fp16_ema" if self.fp16 else None,
+            torch_dtype=self.dtype,
+        ).to(self.device)
+        # for param in self.pipe.parameters():
+        #     param.requires_grad = False
+        self.pipe.image_encoder.eval()
+        self.pipe.vae.eval()
+        self.pipe.unet.eval()
+        self.pipe.clip_camera_projection.eval()
+        self.vae = self.pipe.vae
+        self.unet = self.pipe.unet
+        self.pipe.set_progress_bar_config(disable=True)
+        self.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config)
+        self.num_train_timesteps = self.scheduler.config.num_train_timesteps
+        self.min_step = int(self.num_train_timesteps * t_range[0])
+        self.max_step = int(self.num_train_timesteps * t_range[1])
+        self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience
+        self.embeddings = None
+    @torch.no_grad()
+    def get_img_embeds(self, x):
+        # x: image tensor in [0, 1]
+        x = F.interpolate(x, (256, 256), mode='bilinear', align_corners=False)
+        x_pil = [TF.to_pil_image(image) for image in x]
+        x_clip = self.pipe.feature_extractor(images=x_pil, return_tensors="pt").pixel_values.to(device=self.device, dtype=self.dtype)
+        c = self.pipe.image_encoder(x_clip).image_embeds
+        v = self.encode_imgs(x.to(self.dtype)) / self.vae.config.scaling_factor
+        self.embeddings = [c, v]
+    @torch.no_grad()
+    def refine(self, pred_rgb, polar, azimuth, radius,
+               guidance_scale=5, steps=50, strength=0.8,
+        ):
+        batch_size = pred_rgb.shape[0]
+        self.scheduler.set_timesteps(steps)
+        if strength == 0:
+            init_step = 0
+            latents = torch.randn((1, 4, 32, 32), device=self.device, dtype=self.dtype)
+        else:
+            init_step = int(steps * strength)
+            pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
+            latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
+            latents = self.scheduler.add_noise(latents, torch.randn_like(latents), self.scheduler.timesteps[init_step])
+        T = np.stack([np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), radius], axis=-1)
+        T = torch.from_numpy(T).unsqueeze(1).to(self.dtype).to(self.device) # [8, 1, 4]
+        cc_emb = torch.cat([self.embeddings[0].repeat(batch_size, 1, 1), T], dim=-1)
+        cc_emb = self.pipe.clip_camera_projection(cc_emb)
+        cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)
+        vae_emb = self.embeddings[1].repeat(batch_size, 1, 1, 1)
+        vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)
+        for i, t in enumerate(self.scheduler.timesteps[init_step:]):
+            x_in = torch.cat([latents] * 2)
+            t_in = torch.cat([t.view(1)] * 2).to(self.device)
+            noise_pred = self.unet(
+                torch.cat([x_in, vae_emb], dim=1),
+                t_in.to(self.unet.dtype),
+                encoder_hidden_states=cc_emb,
+            ).sample
+            noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+        imgs = self.decode_latents(latents) # [1, 3, 256, 256]
+        return imgs
+    def train_step(self, pred_rgb, polar, azimuth, radius, step_ratio=None, guidance_scale=5, as_latent=False):
+        # pred_rgb: tensor [1, 3, H, W] in [0, 1]
+        batch_size = pred_rgb.shape[0]
+        if as_latent:
+            latents = F.interpolate(pred_rgb, (32, 32), mode='bilinear', align_corners=False) * 2 - 1
+        else:
+            pred_rgb_256 = F.interpolate(pred_rgb, (256, 256), mode='bilinear', align_corners=False)
+            latents = self.encode_imgs(pred_rgb_256.to(self.dtype))
+        if step_ratio is not None:
+            # dreamtime-like
+            # t = self.max_step - (self.max_step - self.min_step) * np.sqrt(step_ratio)
+            t = np.round((1 - step_ratio) * self.num_train_timesteps).clip(self.min_step, self.max_step)
+            t = torch.full((batch_size,), t, dtype=torch.long, device=self.device)
+        else:
+            t = torch.randint(self.min_step, self.max_step + 1, (batch_size,), dtype=torch.long, device=self.device)
+        w = (1 - self.alphas[t]).view(batch_size, 1, 1, 1)
+        with torch.no_grad():
+            noise = torch.randn_like(latents)
+            latents_noisy = self.scheduler.add_noise(latents, noise, t)
+            x_in = torch.cat([latents_noisy] * 2)
+            t_in = torch.cat([t] * 2)
+            T = np.stack([np.deg2rad(polar), np.sin(np.deg2rad(azimuth)), np.cos(np.deg2rad(azimuth)), radius], axis=-1)
+            T = torch.from_numpy(T).unsqueeze(1).to(self.dtype).to(self.device) # [8, 1, 4]
+            cc_emb = torch.cat([self.embeddings[0].repeat(batch_size, 1, 1), T], dim=-1)
+            cc_emb = self.pipe.clip_camera_projection(cc_emb)
+            cc_emb = torch.cat([cc_emb, torch.zeros_like(cc_emb)], dim=0)
+            vae_emb = self.embeddings[1].repeat(batch_size, 1, 1, 1)
+            vae_emb = torch.cat([vae_emb, torch.zeros_like(vae_emb)], dim=0)
+            noise_pred = self.unet(
+                torch.cat([x_in, vae_emb], dim=1),
+                t_in.to(self.unet.dtype),
+                encoder_hidden_states=cc_emb,
+            ).sample
+        noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+        grad = w * (noise_pred - noise)
+        grad = torch.nan_to_num(grad)
+        target = (latents - grad).detach()
+        loss = 0.5 * F.mse_loss(latents.float(), target, reduction='sum')
+        return loss
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        imgs = self.vae.decode(latents).sample
+        imgs = (imgs / 2 + 0.5).clamp(0, 1)
+        return imgs
+    def encode_imgs(self, imgs, mode=False):
+        # imgs: [B, 3, H, W]
+        imgs = 2 * imgs - 1
+        posterior = self.vae.encode(imgs).latent_dist
+        if mode:
+            latents = posterior.mode()
+        else:
+            latents = posterior.sample()
+        latents = latents * self.vae.config.scaling_factor
+        return latents
+if __name__ == '__main__':
+    import cv2
+    import argparse
+    import numpy as np
+    import matplotlib.pyplot as plt
+    parser = argparse.ArgumentParser()
+    parser.add_argument('input', type=str)
+    parser.add_argument('--polar', type=float, default=0, help='delta polar angle in [-90, 90]')
+    parser.add_argument('--azimuth', type=float, default=0, help='delta azimuth angle in [-180, 180]')
+    parser.add_argument('--radius', type=float, default=0, help='delta camera radius multiplier in [-0.5, 0.5]')
+    opt = parser.parse_args()
+    device = torch.device('cuda')
+    print(f'[INFO] loading image from {opt.input} ...')
+    image = cv2.imread(opt.input, cv2.IMREAD_UNCHANGED)
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image = cv2.resize(image, (256, 256), interpolation=cv2.INTER_AREA)
+    image = image.astype(np.float32) / 255.0
+    image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).contiguous().to(device)
+    print(f'[INFO] loading model ...')
+    zero123 = Zero123(device)
+    print(f'[INFO] running model ...')
+    zero123.get_img_embeds(image)
+    while True:
+        outputs = zero123.refine(image, polar=[opt.polar], azimuth=[opt.azimuth], radius=[opt.radius], strength=0)
+        plt.imshow(outputs.float().cpu().numpy().transpose(0, 2, 3, 1)[0])
+        plt.show()

main.py ADDED Viewed

	@@ -0,0 +1,882 @@

+import os
+import cv2
+import time
+import tqdm
+import numpy as np
+import dearpygui.dearpygui as dpg
+import torch
+import torch.nn.functional as F
+import rembg
+from cam_utils import orbit_camera, OrbitCamera
+from gs_renderer import Renderer, MiniCam
+from grid_put import mipmap_linear_grid_put_2d
+from mesh import Mesh, safe_normalize
+class GUI:
+    def __init__(self, opt):
+        self.opt = opt  # shared with the trainer's opt to support in-place modification of rendering parameters.
+        self.gui = opt.gui # enable gui
+        self.W = opt.W
+        self.H = opt.H
+        self.cam = OrbitCamera(opt.W, opt.H, r=opt.radius, fovy=opt.fovy)
+        self.mode = "image"
+        self.seed = "random"
+        self.buffer_image = np.ones((self.W, self.H, 3), dtype=np.float32)
+        self.need_update = True  # update buffer_image
+        # models
+        self.device = torch.device("cuda")
+        self.bg_remover = None
+        self.guidance_sd = None
+        self.guidance_zero123 = None
+        self.enable_sd = False
+        self.enable_zero123 = False
+        # renderer
+        self.renderer = Renderer(sh_degree=self.opt.sh_degree)
+        self.gaussain_scale_factor = 1
+        # input image
+        self.input_img = None
+        self.input_mask = None
+        self.input_img_torch = None
+        self.input_mask_torch = None
+        self.overlay_input_img = False
+        self.overlay_input_img_ratio = 0.5
+        # input text
+        self.prompt = ""
+        self.negative_prompt = ""
+        # training stuff
+        self.training = False
+        self.optimizer = None
+        self.step = 0
+        self.train_steps = 1  # steps per rendering loop
+        # load input data from cmdline
+        if self.opt.input is not None:
+            self.load_input(self.opt.input)
+        # override prompt from cmdline
+        if self.opt.prompt is not None:
+            self.prompt = self.opt.prompt
+        # override if provide a checkpoint
+        if self.opt.load is not None:
+            self.renderer.initialize(self.opt.load)
+        else:
+            # initialize gaussians to a blob
+            self.renderer.initialize(num_pts=self.opt.num_pts)
+        if self.gui:
+            dpg.create_context()
+            self.register_dpg()
+            self.test_step()
+    def __del__(self):
+        if self.gui:
+            dpg.destroy_context()
+    def seed_everything(self):
+        try:
+            seed = int(self.seed)
+        except:
+            seed = np.random.randint(0, 1000000)
+        os.environ["PYTHONHASHSEED"] = str(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = True
+        self.last_seed = seed
+    def prepare_train(self):
+        self.step = 0
+        # setup training
+        self.renderer.gaussians.training_setup(self.opt)
+        # do not do progressive sh-level
+        self.renderer.gaussians.active_sh_degree = self.renderer.gaussians.max_sh_degree
+        self.optimizer = self.renderer.gaussians.optimizer
+        # default camera
+        pose = orbit_camera(self.opt.elevation, 0, self.opt.radius)
+        self.fixed_cam = MiniCam(
+            pose,
+            self.opt.ref_size,
+            self.opt.ref_size,
+            self.cam.fovy,
+            self.cam.fovx,
+            self.cam.near,
+            self.cam.far,
+        )
+        self.enable_sd = self.opt.lambda_sd > 0 and self.prompt != ""
+        self.enable_zero123 = self.opt.lambda_zero123 > 0 and self.input_img is not None
+        # lazy load guidance model
+        if self.guidance_sd is None and self.enable_sd:
+            print(f"[INFO] loading SD...")
+            from guidance.sd_utils import StableDiffusion
+            self.guidance_sd = StableDiffusion(self.device)
+            print(f"[INFO] loaded SD!")
+        if self.guidance_zero123 is None and self.enable_zero123:
+            print(f"[INFO] loading zero123...")
+            from guidance.zero123_utils import Zero123
+            self.guidance_zero123 = Zero123(self.device)
+            print(f"[INFO] loaded zero123!")
+        # input image
+        if self.input_img is not None:
+            self.input_img_torch = torch.from_numpy(self.input_img).permute(2, 0, 1).unsqueeze(0).to(self.device)
+            self.input_img_torch = F.interpolate(self.input_img_torch, (self.opt.ref_size, self.opt.ref_size), mode="bilinear", align_corners=False)
+            self.input_mask_torch = torch.from_numpy(self.input_mask).permute(2, 0, 1).unsqueeze(0).to(self.device)
+            self.input_mask_torch = F.interpolate(self.input_mask_torch, (self.opt.ref_size, self.opt.ref_size), mode="bilinear", align_corners=False)
+        # prepare embeddings
+        with torch.no_grad():
+            if self.enable_sd:
+                self.guidance_sd.get_text_embeds([self.prompt], [self.negative_prompt])
+            if self.enable_zero123:
+                self.guidance_zero123.get_img_embeds(self.input_img_torch)
+    def train_step(self):
+        """Run ``self.train_steps`` optimization iterations and refresh the GUI log.
+
+        Each iteration accumulates a reconstruction loss on the fixed reference
+        view (only when ``self.input_img_torch`` is set) plus the enabled guidance
+        losses on ``opt.batch_size`` randomly sampled orbit views, steps the
+        optimizer, and, inside the configured iteration window, records
+        densification statistics and periodically densifies/prunes the gaussians.
+        """
+        # CUDA events time the whole call; the elapsed time is shown in the GUI as ms
+        starter = torch.cuda.Event(enable_timing=True)
+        ender = torch.cuda.Event(enable_timing=True)
+        starter.record()
+        for _ in range(self.train_steps):
+            self.step += 1
+            # training progress, capped at 1 once self.step exceeds opt.iters
+            step_ratio = min(1, self.step / self.opt.iters)
+            # update lr
+            self.renderer.gaussians.update_learning_rate(self.step)
+            # starts as a plain int; it becomes a tensor once any loss term is added
+            loss = 0
+            ### known view
+            if self.input_img_torch is not None:
+                cur_cam = self.fixed_cam
+                out = self.renderer.render(cur_cam)
+                # rgb loss
+                image = out["image"].unsqueeze(0) # [1, 3, H, W] in [0, 1]
+                loss = loss + 10000 * step_ratio * F.mse_loss(image, self.input_img_torch)
+                # mask loss
+                mask = out["alpha"].unsqueeze(0) # [1, 1, H, W] in [0, 1]
+                loss = loss + 1000 * step_ratio * F.mse_loss(mask, self.input_mask_torch)
+            ### novel view (manual batch)
+            # coarse-to-fine: 128 -> 256 -> 512 as step_ratio passes 0.3 and 0.6
+            render_resolution = 128 if step_ratio < 0.3 else (256 if step_ratio < 0.6 else 512)
+            images = []
+            vers, hors, radii = [], [], []
+            # avoid too large elevation (> 80 or < -80), and make sure it always cover [-30, 30]
+            min_ver = max(min(-30, -30 - self.opt.elevation), -80 - self.opt.elevation)
+            max_ver = min(max(30, 30 - self.opt.elevation), 80 - self.opt.elevation)
+            for _ in range(self.opt.batch_size):
+                # render random view
+                # ver/hor are integer offsets; np.random.randint excludes the upper bound
+                ver = np.random.randint(min_ver, max_ver)
+                hor = np.random.randint(-180, 180)
+                # no radius jitter: every novel view uses opt.radius
+                radius = 0
+                vers.append(ver)
+                hors.append(hor)
+                radii.append(radius)
+                pose = orbit_camera(self.opt.elevation + ver, hor, self.opt.radius + radius)
+                cur_cam = MiniCam(
+                    pose,
+                    render_resolution,
+                    render_resolution,
+                    self.cam.fovy,
+                    self.cam.fovx,
+                    self.cam.near,
+                    self.cam.far,
+                )
+                # True when the uniform draw exceeds opt.invert_bg_prob
+                invert_bg_color = np.random.rand() > self.opt.invert_bg_prob
+                out = self.renderer.render(cur_cam, invert_bg_color=invert_bg_color)
+                image = out["image"].unsqueeze(0)# [1, 3, H, W] in [0, 1]
+                images.append(image)
+            images = torch.cat(images, dim=0)
+            # import kiui
+            # kiui.lo(hor, ver)
+            # kiui.vis.plot_image(image)
+            # guidance loss
+            if self.enable_sd:
+                loss = loss + self.opt.lambda_sd * self.guidance_sd.train_step(images, step_ratio)
+            if self.enable_zero123:
+                loss = loss + self.opt.lambda_zero123 * self.guidance_zero123.train_step(images, vers, hors, radii, step_ratio)
+            # optimize step
+            loss.backward()
+            self.optimizer.step()
+            self.optimizer.zero_grad()
+            # densify and prune
+            if self.step >= self.opt.density_start_iter and self.step <= self.opt.density_end_iter:
+                # NOTE: `out` is the most recent render (the last novel view when batch_size >= 1),
+                # and `radii` is rebound here from the sampled-offset list to the renderer's output
+                viewspace_point_tensor, visibility_filter, radii = out["viewspace_points"], out["visibility_filter"], out["radii"]
+                self.renderer.gaussians.max_radii2D[visibility_filter] = torch.max(self.renderer.gaussians.max_radii2D[visibility_filter], radii[visibility_filter])
+                self.renderer.gaussians.add_densification_stats(viewspace_point_tensor, visibility_filter)
+                if self.step % self.opt.densification_interval == 0:
+                    # size_threshold = 20 if self.step > self.opt.opacity_reset_interval else None
+                    self.renderer.gaussians.densify_and_prune(self.opt.densify_grad_threshold, min_opacity=0.01, extent=0.5, max_screen_size=1)
+                if self.step % self.opt.opacity_reset_interval == 0:
+                    self.renderer.gaussians.reset_opacity()
+        ender.record()
+        torch.cuda.synchronize()
+        t = starter.elapsed_time(ender)
+        # flag the viewer image as stale so the next test_step re-renders it
+        self.need_update = True
+        if self.gui:
+            dpg.set_value("_log_train_time", f"{t:.4f}ms")
+            dpg.set_value(
+                "_log_train_log",
+                f"step = {self.step: 5d} (+{self.train_steps: 2d}) loss = {loss.item():.4f}",
+            )
+        # dynamic train steps (no need for now)
+        # max allowed train time per-frame is 500 ms
+        # full_t = t / self.train_steps * 16
+        # train_steps = min(16, max(4, int(16 * 500 / full_t)))
+        # if train_steps > self.train_steps * 1.2 or train_steps < self.train_steps * 0.8:
+        #     self.train_steps = train_steps
+    @torch.no_grad()
+    def test_step(self):
+        # ignore if no need to update
+        if not self.need_update:
+            return
+        starter = torch.cuda.Event(enable_timing=True)
+        ender = torch.cuda.Event(enable_timing=True)
+        starter.record()
+        # should update image
+        if self.need_update:
+            # render image
+            cur_cam = MiniCam(
+                self.cam.pose,
+                self.W,
+                self.H,
+                self.cam.fovy,
+                self.cam.fovx,
+                self.cam.near,
+                self.cam.far,
+            )
+            out = self.renderer.render(cur_cam, self.gaussain_scale_factor)
+            buffer_image = out[self.mode]  # [3, H, W]
+            if self.mode in ['depth', 'alpha']:
+                buffer_image = buffer_image.repeat(3, 1, 1)
+                if self.mode == 'depth':
+                    buffer_image = (buffer_image - buffer_image.min()) / (buffer_image.max() - buffer_image.min() + 1e-20)
+            buffer_image = F.interpolate(
+                buffer_image.unsqueeze(0),
+                size=(self.H, self.W),
+                mode="bilinear",
+                align_corners=False,
+            ).squeeze(0)
+            self.buffer_image = (
+                buffer_image.permute(1, 2, 0)
+                .contiguous()
+                .clamp(0, 1)
+                .contiguous()
+                .detach()
+                .cpu()
+                .numpy()
+            )
+            # display input_image
+            if self.overlay_input_img and self.input_img is not None:
+                self.buffer_image = (
+                    self.buffer_image * (1 - self.overlay_input_img_ratio)
+                    + self.input_img * self.overlay_input_img_ratio
+                )
+            self.need_update = False
+        ender.record()
+        torch.cuda.synchronize()
+        t = starter.elapsed_time(ender)
+        if self.gui:
+            dpg.set_value("_log_infer_time", f"{t:.4f}ms ({int(1000/t)} FPS)")
+            dpg.set_value(
+                "_texture", self.buffer_image
+            )  # buffer must be contiguous, else seg fault!
+    def load_input(self, file):
+        # load image
+        print(f'[INFO] load image from {file}...')
+        img = cv2.imread(file, cv2.IMREAD_UNCHANGED)
+        if img.shape[-1] == 3:
+            if self.bg_remover is None:
+                self.bg_remover = rembg.new_session()
+            img = rembg.remove(img, session=self.bg_remover)
+        img = cv2.resize(img, (self.W, self.H), interpolation=cv2.INTER_AREA)
+        img = img.astype(np.float32) / 255.0
+        self.input_mask = img[..., 3:]
+        # white bg
+        self.input_img = img[..., :3] * self.input_mask + (1 - self.input_mask)
+        # bgr to rgb
+        self.input_img = self.input_img[..., ::-1].copy()
+        # load prompt
+        file_prompt = file.replace("_rgba.png", "_caption.txt")
+        if os.path.exists(file_prompt):
+            print(f'[INFO] load prompt from {file_prompt}...')
+            with open(file_prompt, "r") as f:
+                self.prompt = f.read().strip()
+    @torch.no_grad()
+    def save_model(self, mode='geo', texture_size=1024):
+        """Export the current model into ``self.opt.outdir``.
+
+        Args:
+            mode: 'geo' writes an extracted mesh to ``<save_path>_mesh.ply``;
+                'geo+tex' extracts a mesh, bakes a texture from rendered views and
+                writes ``<save_path>_mesh.<mesh_format>``; any other value saves
+                the gaussians to ``<save_path>_model.ply``.
+            texture_size: side length of the square texture baked in 'geo+tex' mode.
+        """
+        os.makedirs(self.opt.outdir, exist_ok=True)
+        if mode == 'geo':
+            path = os.path.join(self.opt.outdir, self.opt.save_path + '_mesh.ply')
+            mesh = self.renderer.gaussians.extract_mesh(path, self.opt.density_thresh)
+            mesh.write_ply(path)
+        elif mode == 'geo+tex':
+            path = os.path.join(self.opt.outdir, self.opt.save_path + '_mesh.' + self.opt.mesh_format)
+            mesh = self.renderer.gaussians.extract_mesh(path, self.opt.density_thresh)
+            # perform texture extraction
+            print(f"[INFO] unwrap uv...")
+            h = w = texture_size
+            mesh.auto_uv()
+            mesh.auto_normal()
+            # accumulated texel colors and per-texel weights
+            albedo = torch.zeros((h, w, 3), device=self.device, dtype=torch.float32)
+            cnt = torch.zeros((h, w, 1), device=self.device, dtype=torch.float32)
+            # self.prepare_train() # tmp fix for not loading 0123
+            # vers = [0]
+            # hors = [0]
+            # 26 views: 8 azimuths at each of elevation 0, -45 and 45, plus two near-pole views
+            vers = [0] * 8 + [-45] * 8 + [45] * 8 + [-89.9, 89.9]
+            hors = [0, 45, -45, 90, -90, 135, -135, 180] * 3 + [0, 0]
+            render_resolution = 512
+            import nvdiffrast.torch as dr
+            # GL context unless force_cuda_rast is set, or the GUI is on and the OS is not Windows
+            if not self.opt.force_cuda_rast and (not self.opt.gui or os.name == 'nt'):
+                glctx = dr.RasterizeGLContext()
+            else:
+                glctx = dr.RasterizeCudaContext()
+            for ver, hor in zip(vers, hors):
+                # render image
+                pose = orbit_camera(ver, hor, self.cam.radius)
+                cur_cam = MiniCam(
+                    pose,
+                    render_resolution,
+                    render_resolution,
+                    self.cam.fovy,
+                    self.cam.fovx,
+                    self.cam.near,
+                    self.cam.far,
+                )
+                cur_out = self.renderer.render(cur_cam)
+                rgbs = cur_out["image"].unsqueeze(0) # [1, 3, H, W] in [0, 1]
+                # enhance texture quality with zero123 [not working well]
+                # if self.opt.guidance_model == 'zero123':
+                #     rgbs = self.guidance.refine(rgbs, [ver], [hor], [0])
+                    # import kiui
+                    # kiui.vis.plot_image(rgbs)
+                # get coordinate in texture image
+                pose = torch.from_numpy(pose.astype(np.float32)).to(self.device)
+                proj = torch.from_numpy(self.cam.perspective.astype(np.float32)).to(self.device)
+                # homogeneous mesh vertices transformed by the inverse pose, then projected
+                v_cam = torch.matmul(F.pad(mesh.v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
+                v_clip = v_cam @ proj.T
+                rast, rast_db = dr.rasterize(glctx, v_clip, mesh.f, (render_resolution, render_resolution))
+                # NOTE: depth and rast_db are not used later in this method
+                depth, _ = dr.interpolate(-v_cam[..., [2]], rast, mesh.f) # [1, H, W, 1]
+                depth = depth.squeeze(0) # [H, W, 1]
+                alpha = (rast[0, ..., 3:] > 0).float()
+                uvs, _ = dr.interpolate(mesh.vt.unsqueeze(0), rast, mesh.ft)  # [1, 512, 512, 2] in [0, 1]
+                # use normal to produce a back-project mask
+                normal, _ = dr.interpolate(mesh.vn.unsqueeze(0).contiguous(), rast, mesh.fn)
+                normal = safe_normalize(normal[0])
+                # rotated normal (where [0, 0, 1] always faces camera)
+                rot_normal = normal @ pose[:3, :3]
+                viewcos = rot_normal[..., [2]]
+                # keep covered pixels whose normal z-component in the camera frame exceeds 0.5
+                mask = (alpha > 0) & (viewcos > 0.5)  # [H, W, 1]
+                mask = mask.view(-1)
+                uvs = uvs.view(-1, 2).clamp(0, 1)[mask]
+                rgbs = rgbs.view(3, -1).permute(1, 0)[mask].contiguous()
+                # update texture image
+                cur_albedo, cur_cnt = mipmap_linear_grid_put_2d(
+                    h, w,
+                    uvs[..., [1, 0]] * 2 - 1,
+                    rgbs,
+                    min_resolution=256,
+                    return_count=True,
+                )
+                # albedo += cur_albedo
+                # cnt += cur_cnt
+                # only fill texels that earlier views left (almost) empty, so earlier views take priority
+                mask = cnt.squeeze(-1) < 0.1
+                albedo[mask] += cur_albedo[mask]
+                cnt[mask] += cur_cnt[mask]
+            # normalize accumulated colors by their weights where anything was written
+            mask = cnt.squeeze(-1) > 0
+            albedo[mask] = albedo[mask] / cnt[mask].repeat(1, 3)
+            mask = mask.view(h, w)
+            albedo = albedo.detach().cpu().numpy()
+            mask = mask.detach().cpu().numpy()
+            # dilate texture
+            from sklearn.neighbors import NearestNeighbors
+            from scipy.ndimage import binary_dilation, binary_erosion
+            # texels to fill: empty texels within 32 dilation iterations of a written texel
+            inpaint_region = binary_dilation(mask, iterations=32)
+            inpaint_region[mask] = 0
+            # texels to copy from: written texels that a 3-iteration erosion removes (the border band)
+            search_region = mask.copy()
+            not_search_region = binary_erosion(search_region, iterations=3)
+            search_region[not_search_region] = 0
+            search_coords = np.stack(np.nonzero(search_region), axis=-1)
+            inpaint_coords = np.stack(np.nonzero(inpaint_region), axis=-1)
+            knn = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit(
+                search_coords
+            )
+            _, indices = knn.kneighbors(inpaint_coords)
+            # each inpaint texel takes the color of its nearest border texel
+            albedo[tuple(inpaint_coords.T)] = albedo[tuple(search_coords[indices[:, 0]].T)]
+            mesh.albedo = torch.from_numpy(albedo).to(self.device)
+            mesh.write(path)
+        else:
+            path = os.path.join(self.opt.outdir, self.opt.save_path + '_model.ply')
+            self.renderer.gaussians.save_ply(path)
+        print(f"[INFO] save model to {path}.")
+    def register_dpg(self):
+        ### register texture
+        with dpg.texture_registry(show=False):
+            dpg.add_raw_texture(
+                self.W,
+                self.H,
+                self.buffer_image,
+                format=dpg.mvFormat_Float_rgb,
+                tag="_texture",
+            )
+        ### register window
+        # the rendered image, as the primary window
+        with dpg.window(
+            tag="_primary_window",
+            width=self.W,
+            height=self.H,
+            pos=[0, 0],
+            no_move=True,
+            no_title_bar=True,
+            no_scrollbar=True,
+        ):
+            # add the texture
+            dpg.add_image("_texture")
+        # dpg.set_primary_window("_primary_window", True)
+        # control window
+        with dpg.window(
+            label="Control",
+            tag="_control_window",
+            width=600,
+            height=self.H,
+            pos=[self.W, 0],
+            no_move=True,
+            no_title_bar=True,
+        ):
+            # button theme
+            with dpg.theme() as theme_button:
+                with dpg.theme_component(dpg.mvButton):
+                    dpg.add_theme_color(dpg.mvThemeCol_Button, (23, 3, 18))
+                    dpg.add_theme_color(dpg.mvThemeCol_ButtonHovered, (51, 3, 47))
+                    dpg.add_theme_color(dpg.mvThemeCol_ButtonActive, (83, 18, 83))
+                    dpg.add_theme_style(dpg.mvStyleVar_FrameRounding, 5)
+                    dpg.add_theme_style(dpg.mvStyleVar_FramePadding, 3, 3)
+            # timer stuff
+            with dpg.group(horizontal=True):
+                dpg.add_text("Infer time: ")
+                dpg.add_text("no data", tag="_log_infer_time")
+            def callback_setattr(sender, app_data, user_data):
+                setattr(self, user_data, app_data)
+            # init stuff
+            with dpg.collapsing_header(label="Initialize", default_open=True):
+                # seed stuff
+                def callback_set_seed(sender, app_data):
+                    self.seed = app_data
+                    self.seed_everything()
+                dpg.add_input_text(
+                    label="seed",
+                    default_value=self.seed,
+                    on_enter=True,
+                    callback=callback_set_seed,
+                )
+                # input stuff
+                def callback_select_input(sender, app_data):
+                    # only one item
+                    for k, v in app_data["selections"].items():
+                        dpg.set_value("_log_input", k)
+                        self.load_input(v)
+                    self.need_update = True
+                with dpg.file_dialog(
+                    directory_selector=False,
+                    show=False,
+                    callback=callback_select_input,
+                    file_count=1,
+                    tag="file_dialog_tag",
+                    width=700,
+                    height=400,
+                ):
+                    dpg.add_file_extension("Images{.jpg,.jpeg,.png}")
+                with dpg.group(horizontal=True):
+                    dpg.add_button(
+                        label="input",
+                        callback=lambda: dpg.show_item("file_dialog_tag"),
+                    )
+                    dpg.add_text("", tag="_log_input")
+                # overlay stuff
+                with dpg.group(horizontal=True):
+                    def callback_toggle_overlay_input_img(sender, app_data):
+                        self.overlay_input_img = not self.overlay_input_img
+                        self.need_update = True
+                    dpg.add_checkbox(
+                        label="overlay image",
+                        default_value=self.overlay_input_img,
+                        callback=callback_toggle_overlay_input_img,
+                    )
+                    def callback_set_overlay_input_img_ratio(sender, app_data):
+                        self.overlay_input_img_ratio = app_data
+                        self.need_update = True
+                    dpg.add_slider_float(
+                        label="ratio",
+                        min_value=0,
+                        max_value=1,
+                        format="%.1f",
+                        default_value=self.overlay_input_img_ratio,
+                        callback=callback_set_overlay_input_img_ratio,
+                    )
+                # prompt stuff
+                dpg.add_input_text(
+                    label="prompt",
+                    default_value=self.prompt,
+                    callback=callback_setattr,
+                    user_data="prompt",
+                )
+                dpg.add_input_text(
+                    label="negative",
+                    default_value=self.negative_prompt,
+                    callback=callback_setattr,
+                    user_data="negative_prompt",
+                )
+                # save current model
+                with dpg.group(horizontal=True):
+                    dpg.add_text("Save: ")
+                    def callback_save(sender, app_data, user_data):
+                        self.save_model(mode=user_data)
+                    dpg.add_button(
+                        label="model",
+                        tag="_button_save_model",
+                        callback=callback_save,
+                        user_data='model',
+                    )
+                    dpg.bind_item_theme("_button_save_model", theme_button)
+                    dpg.add_button(
+                        label="geo",
+                        tag="_button_save_mesh",
+                        callback=callback_save,
+                        user_data='geo',
+                    )
+                    dpg.bind_item_theme("_button_save_mesh", theme_button)
+                    dpg.add_button(
+                        label="geo+tex",
+                        tag="_button_save_mesh_with_tex",
+                        callback=callback_save,
+                        user_data='geo+tex',
+                    )
+                    dpg.bind_item_theme("_button_save_mesh_with_tex", theme_button)
+                    dpg.add_input_text(
+                        label="",
+                        default_value=self.opt.save_path,
+                        callback=callback_setattr,
+                        user_data="save_path",
+                    )
+            # training stuff
+            with dpg.collapsing_header(label="Train", default_open=True):
+                # lr and train button
+                with dpg.group(horizontal=True):
+                    dpg.add_text("Train: ")
+                    def callback_train(sender, app_data):
+                        if self.training:
+                            self.training = False
+                            dpg.configure_item("_button_train", label="start")
+                        else:
+                            self.prepare_train()
+                            self.training = True
+                            dpg.configure_item("_button_train", label="stop")
+                    # dpg.add_button(
+                    #     label="init", tag="_button_init", callback=self.prepare_train
+                    # )
+                    # dpg.bind_item_theme("_button_init", theme_button)
+                    dpg.add_button(
+                        label="start", tag="_button_train", callback=callback_train
+                    )
+                    dpg.bind_item_theme("_button_train", theme_button)
+                with dpg.group(horizontal=True):
+                    dpg.add_text("", tag="_log_train_time")
+                    dpg.add_text("", tag="_log_train_log")
+            # rendering options
+            with dpg.collapsing_header(label="Rendering", default_open=True):
+                # mode combo
+                def callback_change_mode(sender, app_data):
+                    self.mode = app_data
+                    self.need_update = True
+                dpg.add_combo(
+                    ("image", "depth", "alpha"),
+                    label="mode",
+                    default_value=self.mode,
+                    callback=callback_change_mode,
+                )
+                # fov slider
+                def callback_set_fovy(sender, app_data):
+                    self.cam.fovy = np.deg2rad(app_data)
+                    self.need_update = True
+                dpg.add_slider_int(
+                    label="FoV (vertical)",
+                    min_value=1,
+                    max_value=120,
+                    format="%d deg",
+                    default_value=np.rad2deg(self.cam.fovy),
+                    callback=callback_set_fovy,
+                )
+                def callback_set_gaussain_scale(sender, app_data):
+                    self.gaussain_scale_factor = app_data
+                    self.need_update = True
+                dpg.add_slider_float(
+                    label="gaussain scale",
+                    min_value=0,
+                    max_value=1,
+                    format="%.2f",
+                    default_value=self.gaussain_scale_factor,
+                    callback=callback_set_gaussain_scale,
+                )
+        ### register camera handler
+        def callback_camera_drag_rotate_or_draw_mask(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            dx = app_data[1]
+            dy = app_data[2]
+            self.cam.orbit(dx, dy)
+            self.need_update = True
+        def callback_camera_wheel_scale(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            delta = app_data
+            self.cam.scale(delta)
+            self.need_update = True
+        def callback_camera_drag_pan(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            dx = app_data[1]
+            dy = app_data[2]
+            self.cam.pan(dx, dy)
+            self.need_update = True
+        def callback_set_mouse_loc(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            # just the pixel coordinate in image
+            self.mouse_loc = np.array(app_data)
+        with dpg.handler_registry():
+            # for camera moving
+            dpg.add_mouse_drag_handler(
+                button=dpg.mvMouseButton_Left,
+                callback=callback_camera_drag_rotate_or_draw_mask,
+            )
+            dpg.add_mouse_wheel_handler(callback=callback_camera_wheel_scale)
+            dpg.add_mouse_drag_handler(
+                button=dpg.mvMouseButton_Middle, callback=callback_camera_drag_pan
+            )
+        dpg.create_viewport(
+            title="Gaussian3D",
+            width=self.W + 600,
+            height=self.H + (45 if os.name == "nt" else 0),
+            resizable=False,
+        )
+        ### global theme
+        with dpg.theme() as theme_no_padding:
+            with dpg.theme_component(dpg.mvAll):
+                # set all padding to 0 to avoid scroll bar
+                dpg.add_theme_style(
+                    dpg.mvStyleVar_WindowPadding, 0, 0, category=dpg.mvThemeCat_Core
+                )
+                dpg.add_theme_style(
+                    dpg.mvStyleVar_FramePadding, 0, 0, category=dpg.mvThemeCat_Core
+                )
+                dpg.add_theme_style(
+                    dpg.mvStyleVar_CellPadding, 0, 0, category=dpg.mvThemeCat_Core
+                )
+        dpg.bind_item_theme("_primary_window", theme_no_padding)
+        dpg.setup_dearpygui()
+        ### register a larger font
+        # get it from: https://github.com/lxgw/LxgwWenKai/releases/download/v1.300/LXGWWenKai-Regular.ttf
+        if os.path.exists("LXGWWenKai-Regular.ttf"):
+            with dpg.font_registry():
+                with dpg.font("LXGWWenKai-Regular.ttf", 18) as default_font:
+                    dpg.bind_font(default_font)
+        # dpg.show_metrics()
+        dpg.show_viewport()
+    def render(self):
+        assert self.gui
+        while dpg.is_dearpygui_running():
+            # update texture every frame
+            if self.training:
+                self.train_step()
+            self.test_step()
+            dpg.render_dearpygui_frame()
+    # no gui mode
+    def train(self, iters=500):
+        if iters > 0:
+            self.prepare_train()
+            for i in tqdm.trange(iters):
+                self.train_step()
+            # do a last prune
+            self.renderer.gaussians.prune(min_opacity=0.01, extent=1, max_screen_size=1)
+        # save
+        self.save_model(mode='model')
+        self.save_model(mode='geo+tex')
+if __name__ == "__main__":
+    import argparse
+    from omegaconf import OmegaConf
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", required=True, help="path to the yaml config file")
+    args, extras = parser.parse_known_args()
+    # override default config from cli
+    opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
+    gui = GUI(opt)
+    if opt.gui:
+        gui.render()
+    else:
+        gui.train(opt.iters)

main2.py ADDED Viewed

	@@ -0,0 +1,671 @@

+import os
+import cv2
+import time
+import tqdm
+import numpy as np
+import dearpygui.dearpygui as dpg
+import torch
+import torch.nn.functional as F
+import trimesh
+import rembg
+from cam_utils import orbit_camera, OrbitCamera
+from mesh_renderer import Renderer
+# from kiui.lpips import LPIPS
+class GUI:
+    def __init__(self, opt):
+        self.opt = opt  # shared with the trainer's opt to support in-place modification of rendering parameters.
+        self.gui = opt.gui # enable gui
+        self.W = opt.W
+        self.H = opt.H
+        self.cam = OrbitCamera(opt.W, opt.H, r=opt.radius, fovy=opt.fovy)
+        self.mode = "image"
+        self.seed = "random"
+        self.buffer_image = np.ones((self.W, self.H, 3), dtype=np.float32)
+        self.need_update = True  # update buffer_image
+        # models
+        self.device = torch.device("cuda")
+        self.bg_remover = None
+        self.guidance_sd = None
+        self.guidance_zero123 = None
+        self.enable_sd = False
+        self.enable_zero123 = False
+        # renderer
+        self.renderer = Renderer(opt).to(self.device)
+        # input image
+        self.input_img = None
+        self.input_mask = None
+        self.input_img_torch = None
+        self.input_mask_torch = None
+        self.overlay_input_img = False
+        self.overlay_input_img_ratio = 0.5
+        # input text
+        self.prompt = ""
+        self.negative_prompt = ""
+        # training stuff
+        self.training = False
+        self.optimizer = None
+        self.step = 0
+        self.train_steps = 1  # steps per rendering loop
+        # self.lpips_loss = LPIPS(net='vgg').to(self.device)
+        # load input data from cmdline
+        if self.opt.input is not None:
+            self.load_input(self.opt.input)
+        # override prompt from cmdline
+        if self.opt.prompt is not None:
+            self.prompt = self.opt.prompt
+        if self.gui:
+            dpg.create_context()
+            self.register_dpg()
+            self.test_step()
+    def __del__(self):
+        if self.gui:
+            dpg.destroy_context()
+    def seed_everything(self):
+        try:
+            seed = int(self.seed)
+        except:
+            seed = np.random.randint(0, 1000000)
+        os.environ["PYTHONHASHSEED"] = str(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = True
+        self.last_seed = seed
+    def prepare_train(self):
+        self.step = 0
+        # setup training
+        self.optimizer = torch.optim.Adam(self.renderer.get_params())
+        # default camera
+        pose = orbit_camera(self.opt.elevation, 0, self.opt.radius)
+        self.fixed_cam = (pose, self.cam.perspective)
+        self.enable_sd = self.opt.lambda_sd > 0 and self.prompt != ""
+        self.enable_zero123 = self.opt.lambda_zero123 > 0 and self.input_img is not None
+        # lazy load guidance model
+        if self.guidance_sd is None and self.enable_sd:
+            print(f"[INFO] loading SD...")
+            from guidance.sd_utils import StableDiffusion
+            self.guidance_sd = StableDiffusion(self.device)
+            print(f"[INFO] loaded SD!")
+        if self.guidance_zero123 is None and self.enable_zero123:
+            print(f"[INFO] loading zero123...")
+            from guidance.zero123_utils import Zero123
+            self.guidance_zero123 = Zero123(self.device)
+            print(f"[INFO] loaded zero123!")
+        # input image
+        if self.input_img is not None:
+            self.input_img_torch = torch.from_numpy(self.input_img).permute(2, 0, 1).unsqueeze(0).to(self.device)
+            self.input_img_torch = F.interpolate(
+                self.input_img_torch, (self.opt.ref_size, self.opt.ref_size), mode="bilinear", align_corners=False
+            )
+            self.input_mask_torch = torch.from_numpy(self.input_mask).permute(2, 0, 1).unsqueeze(0).to(self.device)
+            self.input_mask_torch = F.interpolate(
+                self.input_mask_torch, (self.opt.ref_size, self.opt.ref_size), mode="bilinear", align_corners=False
+            )
+            self.input_img_torch_channel_last = self.input_img_torch[0].permute(1,2,0).contiguous()
+        # prepare embeddings
+        with torch.no_grad():
+            if self.enable_sd:
+                self.guidance_sd.get_text_embeds([self.prompt], [self.negative_prompt])
+            if self.enable_zero123:
+                self.guidance_zero123.get_img_embeds(self.input_img_torch)
+    def train_step(self):
+        """Run ``self.train_steps`` optimization iterations and report timing/loss to the GUI.
+        Every iteration sums (a) a masked MSE between a render from ``self.fixed_cam`` and the
+        reference image, when one is loaded, and (b) an MSE between a batch of randomly posed
+        renders and their refined versions returned by each enabled guidance model, then
+        back-propagates and takes one optimizer step. The whole call is timed with CUDA events.
+        """
+        starter = torch.cuda.Event(enable_timing=True)
+        ender = torch.cuda.Event(enable_timing=True)
+        starter.record()
+        for _ in range(self.train_steps):
+            self.step += 1
+            # progress clamped to 1; only the commented-out train_step() guidance calls below consume it
+            step_ratio = min(1, self.step / self.opt.iters_refine)
+            # starts as a plain int and becomes a tensor once a loss term is added
+            loss = 0
+            ### known view
+            if self.input_img_torch is not None:
+                # random supersampling factor, clamped to [0.125, 2.0]
+                ssaa = min(2.0, max(0.125, 2 * np.random.random()))
+                out = self.renderer.render(*self.fixed_cam, self.opt.ref_size, self.opt.ref_size, ssaa=ssaa)
+                # rgb loss
+                image = out["image"] # [H, W, 3] in [0, 1]
+                # supervise only covered pixels (alpha > 0) with viewcos above 0.5; detached so the mask carries no gradient
+                valid_mask = ((out["alpha"] > 0) & (out["viewcos"] > 0.5)).detach()
+                loss = loss + F.mse_loss(image * valid_mask, self.input_img_torch_channel_last * valid_mask)
+            ### novel view (manual batch)
+            render_resolution = 512
+            images = []
+            # per-view offsets, forwarded to the zero123 guidance below
+            vers, hors, radii = [], [], []
+            # avoid too large elevation (> 80 or < -80), and make sure it always cover [-30, 30]
+            min_ver = max(min(-30, -30 - self.opt.elevation), -80 - self.opt.elevation)
+            max_ver = min(max(30, 30 - self.opt.elevation), 80 - self.opt.elevation)
+            for _ in range(self.opt.batch_size):
+                # render random view
+                # np.random.randint excludes the upper bound, so ver < max_ver and hor < 180
+                ver = np.random.randint(min_ver, max_ver)
+                hor = np.random.randint(-180, 180)
+                # the radius offset is fixed at 0, i.e. every view uses self.opt.radius
+                radius = 0
+                vers.append(ver)
+                hors.append(hor)
+                radii.append(radius)
+                # ver is an offset added to the configured elevation
+                pose = orbit_camera(self.opt.elevation + ver, hor, self.opt.radius + radius)
+                # random render resolution
+                ssaa = min(2.0, max(0.125, 2 * np.random.random()))
+                out = self.renderer.render(pose, self.cam.perspective, render_resolution, render_resolution, ssaa=ssaa)
+                image = out["image"] # [H, W, 3] in [0, 1]
+                image = image.permute(2,0,1).contiguous().unsqueeze(0) # [1, 3, H, W] in [0, 1]
+                images.append(image)
+            # stack the single-view renders along the batch dimension
+            images = torch.cat(images, dim=0)
+            # import kiui
+            # kiui.lo(hor, ver)
+            # kiui.vis.plot_image(image)
+            # guidance loss
+            if self.enable_sd:
+                # loss = loss + self.opt.lambda_sd * self.guidance_sd.train_step(images, step_ratio)
+                # refine() output is cast to float and resized back to the render resolution before the comparison
+                refined_images = self.guidance_sd.refine(images, strength=0.6).float()
+                refined_images = F.interpolate(refined_images, (render_resolution, render_resolution), mode="bilinear", align_corners=False)
+                loss = loss + self.opt.lambda_sd * F.mse_loss(images, refined_images)
+            if self.enable_zero123:
+                # loss = loss + self.opt.lambda_zero123 * self.guidance_zero123.train_step(images, vers, hors, radii, step_ratio)
+                refined_images = self.guidance_zero123.refine(images, vers, hors, radii, strength=0.6).float()
+                refined_images = F.interpolate(refined_images, (render_resolution, render_resolution), mode="bilinear", align_corners=False)
+                loss = loss + self.opt.lambda_zero123 * F.mse_loss(images, refined_images)
+                # loss = loss + self.opt.lambda_zero123 * self.lpips_loss(images, refined_images)
+            # optimize step
+            # NOTE(review): with no reference image and no guidance enabled, loss is still the int 0 here and has no backward()
+            loss.backward()
+            self.optimizer.step()
+            self.optimizer.zero_grad()
+        ender.record()
+        # synchronize before reading the elapsed time between the two events
+        torch.cuda.synchronize()
+        t = starter.elapsed_time(ender)
+        # ask test_step() to re-render with the updated parameters
+        self.need_update = True
+        if self.gui:
+            dpg.set_value("_log_train_time", f"{t:.4f}ms")
+            # the logged loss is the one from the last iteration of the loop above
+            dpg.set_value(
+                "_log_train_log",
+                f"step = {self.step: 5d} (+{self.train_steps: 2d}) loss = {loss.item():.4f}",
+            )
+        # dynamic train steps (no need for now)
+        # max allowed train time per-frame is 500 ms
+        # full_t = t / self.train_steps * 16
+        # train_steps = min(16, max(4, int(16 * 500 / full_t)))
+        # if train_steps > self.train_steps * 1.2 or train_steps < self.train_steps * 0.8:
+        #     self.train_steps = train_steps
+    @torch.no_grad()
+    def test_step(self):
+        # ignore if no need to update
+        if not self.need_update:
+            return
+        starter = torch.cuda.Event(enable_timing=True)
+        ender = torch.cuda.Event(enable_timing=True)
+        starter.record()
+        # should update image
+        if self.need_update:
+            # render image
+            out = self.renderer.render(self.cam.pose, self.cam.perspective, self.H, self.W)
+            buffer_image = out[self.mode]  # [H, W, 3]
+            if self.mode in ['depth', 'alpha']:
+                buffer_image = buffer_image.repeat(1, 1, 3)
+                if self.mode == 'depth':
+                    buffer_image = (buffer_image - buffer_image.min()) / (buffer_image.max() - buffer_image.min() + 1e-20)
+            self.buffer_image = buffer_image.contiguous().clamp(0, 1).detach().cpu().numpy()
+            # display input_image
+            if self.overlay_input_img and self.input_img is not None:
+                self.buffer_image = (
+                    self.buffer_image * (1 - self.overlay_input_img_ratio)
+                    + self.input_img * self.overlay_input_img_ratio
+                )
+            self.need_update = False
+        ender.record()
+        torch.cuda.synchronize()
+        t = starter.elapsed_time(ender)
+        if self.gui:
+            dpg.set_value("_log_infer_time", f"{t:.4f}ms ({int(1000/t)} FPS)")
+            dpg.set_value(
+                "_texture", self.buffer_image
+            )  # buffer must be contiguous, else seg fault!
+    def load_input(self, file):
+        # load image
+        print(f'[INFO] load image from {file}...')
+        img = cv2.imread(file, cv2.IMREAD_UNCHANGED)
+        if img.shape[-1] == 3:
+            if self.bg_remover is None:
+                self.bg_remover = rembg.new_session()
+            img = rembg.remove(img, session=self.bg_remover)
+        img = cv2.resize(
+            img, (self.W, self.H), interpolation=cv2.INTER_AREA
+        )
+        img = img.astype(np.float32) / 255.0
+        self.input_mask = img[..., 3:]
+        # white bg
+        self.input_img = img[..., :3] * self.input_mask + (
+            1 - self.input_mask
+        )
+        # bgr to rgb
+        self.input_img = self.input_img[..., ::-1].copy()
+        # load prompt
+        file_prompt = file.replace("_rgba.png", "_caption.txt")
+        if os.path.exists(file_prompt):
+            print(f'[INFO] load prompt from {file_prompt}...')
+            with open(file_prompt, "r") as f:
+                self.prompt = f.read().strip()
+    def save_model(self):
+        os.makedirs(self.opt.outdir, exist_ok=True)
+        path = os.path.join(self.opt.outdir, self.opt.save_path + '.' + self.opt.mesh_format)
+        self.renderer.export_mesh(path)
+        print(f"[INFO] save model to {path}.")
+    def register_dpg(self):
+        ### register texture
+        with dpg.texture_registry(show=False):
+            dpg.add_raw_texture(
+                self.W,
+                self.H,
+                self.buffer_image,
+                format=dpg.mvFormat_Float_rgb,
+                tag="_texture",
+            )
+        ### register window
+        # the rendered image, as the primary window
+        with dpg.window(
+            tag="_primary_window",
+            width=self.W,
+            height=self.H,
+            pos=[0, 0],
+            no_move=True,
+            no_title_bar=True,
+            no_scrollbar=True,
+        ):
+            # add the texture
+            dpg.add_image("_texture")
+        # dpg.set_primary_window("_primary_window", True)
+        # control window
+        with dpg.window(
+            label="Control",
+            tag="_control_window",
+            width=600,
+            height=self.H,
+            pos=[self.W, 0],
+            no_move=True,
+            no_title_bar=True,
+        ):
+            # button theme
+            with dpg.theme() as theme_button:
+                with dpg.theme_component(dpg.mvButton):
+                    dpg.add_theme_color(dpg.mvThemeCol_Button, (23, 3, 18))
+                    dpg.add_theme_color(dpg.mvThemeCol_ButtonHovered, (51, 3, 47))
+                    dpg.add_theme_color(dpg.mvThemeCol_ButtonActive, (83, 18, 83))
+                    dpg.add_theme_style(dpg.mvStyleVar_FrameRounding, 5)
+                    dpg.add_theme_style(dpg.mvStyleVar_FramePadding, 3, 3)
+            # timer stuff
+            with dpg.group(horizontal=True):
+                dpg.add_text("Infer time: ")
+                dpg.add_text("no data", tag="_log_infer_time")
+            def callback_setattr(sender, app_data, user_data):
+                setattr(self, user_data, app_data)
+            # init stuff
+            with dpg.collapsing_header(label="Initialize", default_open=True):
+                # seed stuff
+                def callback_set_seed(sender, app_data):
+                    self.seed = app_data
+                    self.seed_everything()
+                dpg.add_input_text(
+                    label="seed",
+                    default_value=self.seed,
+                    on_enter=True,
+                    callback=callback_set_seed,
+                )
+                # input stuff
+                def callback_select_input(sender, app_data):
+                    # only one item
+                    for k, v in app_data["selections"].items():
+                        dpg.set_value("_log_input", k)
+                        self.load_input(v)
+                    self.need_update = True
+                with dpg.file_dialog(
+                    directory_selector=False,
+                    show=False,
+                    callback=callback_select_input,
+                    file_count=1,
+                    tag="file_dialog_tag",
+                    width=700,
+                    height=400,
+                ):
+                    dpg.add_file_extension("Images{.jpg,.jpeg,.png}")
+                with dpg.group(horizontal=True):
+                    dpg.add_button(
+                        label="input",
+                        callback=lambda: dpg.show_item("file_dialog_tag"),
+                    )
+                    dpg.add_text("", tag="_log_input")
+                # overlay stuff
+                with dpg.group(horizontal=True):
+                    def callback_toggle_overlay_input_img(sender, app_data):
+                        self.overlay_input_img = not self.overlay_input_img
+                        self.need_update = True
+                    dpg.add_checkbox(
+                        label="overlay image",
+                        default_value=self.overlay_input_img,
+                        callback=callback_toggle_overlay_input_img,
+                    )
+                    def callback_set_overlay_input_img_ratio(sender, app_data):
+                        self.overlay_input_img_ratio = app_data
+                        self.need_update = True
+                    dpg.add_slider_float(
+                        label="ratio",
+                        min_value=0,
+                        max_value=1,
+                        format="%.1f",
+                        default_value=self.overlay_input_img_ratio,
+                        callback=callback_set_overlay_input_img_ratio,
+                    )
+                # prompt stuff
+                dpg.add_input_text(
+                    label="prompt",
+                    default_value=self.prompt,
+                    callback=callback_setattr,
+                    user_data="prompt",
+                )
+                dpg.add_input_text(
+                    label="negative",
+                    default_value=self.negative_prompt,
+                    callback=callback_setattr,
+                    user_data="negative_prompt",
+                )
+                # save current model
+                with dpg.group(horizontal=True):
+                    dpg.add_text("Save: ")
+                    dpg.add_button(
+                        label="model",
+                        tag="_button_save_model",
+                        callback=self.save_model,
+                    )
+                    dpg.bind_item_theme("_button_save_model", theme_button)
+                    dpg.add_input_text(
+                        label="",
+                        default_value=self.opt.save_path,
+                        callback=callback_setattr,
+                        user_data="save_path",
+                    )
+            # training stuff
+            with dpg.collapsing_header(label="Train", default_open=True):
+                # lr and train button
+                with dpg.group(horizontal=True):
+                    dpg.add_text("Train: ")
+                    def callback_train(sender, app_data):
+                        if self.training:
+                            self.training = False
+                            dpg.configure_item("_button_train", label="start")
+                        else:
+                            self.prepare_train()
+                            self.training = True
+                            dpg.configure_item("_button_train", label="stop")
+                    # dpg.add_button(
+                    #     label="init", tag="_button_init", callback=self.prepare_train
+                    # )
+                    # dpg.bind_item_theme("_button_init", theme_button)
+                    dpg.add_button(
+                        label="start", tag="_button_train", callback=callback_train
+                    )
+                    dpg.bind_item_theme("_button_train", theme_button)
+                with dpg.group(horizontal=True):
+                    dpg.add_text("", tag="_log_train_time")
+                    dpg.add_text("", tag="_log_train_log")
+            # rendering options
+            with dpg.collapsing_header(label="Rendering", default_open=True):
+                # mode combo
+                def callback_change_mode(sender, app_data):
+                    self.mode = app_data
+                    self.need_update = True
+                dpg.add_combo(
+                    ("image", "depth", "alpha", "normal"),
+                    label="mode",
+                    default_value=self.mode,
+                    callback=callback_change_mode,
+                )
+                # fov slider
+                def callback_set_fovy(sender, app_data):
+                    self.cam.fovy = np.deg2rad(app_data)
+                    self.need_update = True
+                dpg.add_slider_int(
+                    label="FoV (vertical)",
+                    min_value=1,
+                    max_value=120,
+                    format="%d deg",
+                    default_value=np.rad2deg(self.cam.fovy),
+                    callback=callback_set_fovy,
+                )
+        ### register camera handler
+        def callback_camera_drag_rotate_or_draw_mask(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            dx = app_data[1]
+            dy = app_data[2]
+            self.cam.orbit(dx, dy)
+            self.need_update = True
+        def callback_camera_wheel_scale(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            delta = app_data
+            self.cam.scale(delta)
+            self.need_update = True
+        def callback_camera_drag_pan(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            dx = app_data[1]
+            dy = app_data[2]
+            self.cam.pan(dx, dy)
+            self.need_update = True
+        def callback_set_mouse_loc(sender, app_data):
+            if not dpg.is_item_focused("_primary_window"):
+                return
+            # just the pixel coordinate in image
+            self.mouse_loc = np.array(app_data)
+        with dpg.handler_registry():
+            # for camera moving
+            dpg.add_mouse_drag_handler(
+                button=dpg.mvMouseButton_Left,
+                callback=callback_camera_drag_rotate_or_draw_mask,
+            )
+            dpg.add_mouse_wheel_handler(callback=callback_camera_wheel_scale)
+            dpg.add_mouse_drag_handler(
+                button=dpg.mvMouseButton_Middle, callback=callback_camera_drag_pan
+            )
+        dpg.create_viewport(
+            title="Gaussian3D",
+            width=self.W + 600,
+            height=self.H + (45 if os.name == "nt" else 0),
+            resizable=False,
+        )
+        ### global theme
+        with dpg.theme() as theme_no_padding:
+            with dpg.theme_component(dpg.mvAll):
+                # set all padding to 0 to avoid scroll bar
+                dpg.add_theme_style(
+                    dpg.mvStyleVar_WindowPadding, 0, 0, category=dpg.mvThemeCat_Core
+                )
+                dpg.add_theme_style(
+                    dpg.mvStyleVar_FramePadding, 0, 0, category=dpg.mvThemeCat_Core
+                )
+                dpg.add_theme_style(
+                    dpg.mvStyleVar_CellPadding, 0, 0, category=dpg.mvThemeCat_Core
+                )
+        dpg.bind_item_theme("_primary_window", theme_no_padding)
+        dpg.setup_dearpygui()
+        ### register a larger font
+        # get it from: https://github.com/lxgw/LxgwWenKai/releases/download/v1.300/LXGWWenKai-Regular.ttf
+        if os.path.exists("LXGWWenKai-Regular.ttf"):
+            with dpg.font_registry():
+                with dpg.font("LXGWWenKai-Regular.ttf", 18) as default_font:
+                    dpg.bind_font(default_font)
+        # dpg.show_metrics()
+        dpg.show_viewport()
+    def render(self):
+        assert self.gui
+        while dpg.is_dearpygui_running():
+            # update texture every frame
+            if self.training:
+                self.train_step()
+            self.test_step()
+            dpg.render_dearpygui_frame()
+    # no gui mode
+    def train(self, iters=500):
+        if iters > 0:
+            self.prepare_train()
+            for i in tqdm.trange(iters):
+                self.train_step()
+        # save
+        self.save_model()
+if __name__ == "__main__":
+    import argparse
+    from omegaconf import OmegaConf
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", required=True, help="path to the yaml config file")
+    args, extras = parser.parse_known_args()
+    # override default config from cli
+    opt = OmegaConf.merge(OmegaConf.load(args.config), OmegaConf.from_cli(extras))
+    # auto find mesh from stage 1
+    if opt.mesh is None:
+        default_path = os.path.join(opt.outdir, opt.save_path + '_mesh.' + opt.mesh_format)
+        if os.path.exists(default_path):
+            opt.mesh = default_path
+        else:
+            raise ValueError(f"Cannot find mesh from {default_path}, must specify --mesh explicitly!")
+    gui = GUI(opt)
+    if opt.gui:
+        gui.render()
+    else:
+        gui.train(opt.iters_refine)

mesh.py ADDED Viewed

	@@ -0,0 +1,622 @@

+import os
+import cv2
+import torch
+import trimesh
+import numpy as np
+def dot(x, y):
+    return torch.sum(x * y, -1, keepdim=True)
+def length(x, eps=1e-20):
+    return torch.sqrt(torch.clamp(dot(x, x), min=eps))
+def safe_normalize(x, eps=1e-20):
+    return x / length(x, eps)
+class Mesh:
+    def __init__(
+        self,
+        v=None,
+        f=None,
+        vn=None,
+        fn=None,
+        vt=None,
+        ft=None,
+        albedo=None,
+        vc=None, # vertex color
+        device=None,
+    ):
+        self.device = device
+        self.v = v
+        self.vn = vn
+        self.vt = vt
+        self.f = f
+        self.fn = fn
+        self.ft = ft
+        # only support a single albedo
+        self.albedo = albedo
+        # support vertex color is no albedo
+        self.vc = vc
+        self.ori_center = 0
+        self.ori_scale = 1
+    @classmethod
+    def load(cls, path=None, resize=True, renormal=True, retex=False, front_dir='+z', **kwargs):
+        """Create a mesh from a file or from keyword arguments, then apply optional fix-ups.
+        Args:
+            path: mesh file; ``.obj`` goes through ``load_obj``, anything else through
+                ``load_trimesh``. When ``None`` the mesh is built as ``cls(**kwargs)``.
+            resize: call ``auto_size()`` on the loaded mesh.
+            renormal: recompute normals with ``auto_normal()``; this also happens whenever
+                the mesh has no vertex normals.
+            retex: regenerate texture coordinates with ``auto_uv()``; this also happens when
+                an albedo is present but ``vt`` is missing.
+            front_dir: string searched for an axis token ("-z", "+x", "-x", "+y", "-y") and
+                for a digit "1"/"2"/"3" selecting an additional matrix; "+z" leaves the
+                mesh untouched.
+            **kwargs: forwarded to the constructor or to the selected loader.
+        Returns:
+            The loaded mesh.
+        """
+        # assume init with kwargs
+        if path is None:
+            mesh = cls(**kwargs)
+        # obj supports face uv
+        elif path.endswith(".obj"):
+            mesh = cls.load_obj(path, **kwargs)
+        # trimesh only supports vertex uv, but can load more formats
+        else:
+            mesh = cls.load_trimesh(path, **kwargs)
+        print(f"[Mesh loading] v: {mesh.v.shape}, f: {mesh.f.shape}")
+        # auto-normalize
+        if resize:
+            mesh.auto_size()
+        # auto-fix normal
+        if renormal or mesh.vn is None:
+            mesh.auto_normal()
+            print(f"[Mesh loading] vn: {mesh.vn.shape}, fn: {mesh.fn.shape}")
+        # auto-fix texcoords
+        if retex or (mesh.albedo is not None and mesh.vt is None):
+            # the mesh path doubles as the base name of the uv cache file
+            mesh.auto_uv(cache_path=path)
+            print(f"[Mesh loading] vt: {mesh.vt.shape}, ft: {mesh.ft.shape}")
+        # rotate front dir to +z
+        if front_dir != "+z":
+            # axis switch
+            # tokens are matched by substring, in this order; the first hit wins
+            if "-z" in front_dir:
+                T = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, -1]], device=mesh.device, dtype=torch.float32)
+            elif "+x" in front_dir:
+                T = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32)
+            elif "-x" in front_dir:
+                T = torch.tensor([[0, 0, -1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32)
+            elif "+y" in front_dir:
+                T = torch.tensor([[1, 0, 0], [0, 0, 1], [0, 1, 0]], device=mesh.device, dtype=torch.float32)
+            elif "-y" in front_dir:
+                T = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], device=mesh.device, dtype=torch.float32)
+            else:
+                # no axis token (e.g. only a rotation digit): start from the identity
+                T = torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
+            # rotation (how many 90 degrees)
+            if '1' in front_dir:
+                T @= torch.tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
+            elif '2' in front_dir:
+                # NOTE(review): this matrix negates only y (a mirror); a 180 degree turn about z would negate x and y — confirm intent
+                T @= torch.tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
+            elif '3' in front_dir:
+                T @= torch.tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32)
+            # positions and normals are row vectors, so the transform is applied on the right
+            mesh.v @= T
+            mesh.vn @= T
+        return mesh
+    # load from obj file
+    @classmethod
+    def load_obj(cls, path, albedo_path=None, device=None):
+        """Parse a Wavefront ``.obj`` file into a mesh.
+        Reads ``v``/``vn``/``vt``/``f`` records, fan-triangulates polygons and flips the v
+        texture coordinate. Vertex lines with six values are treated as position plus vertex
+        color; otherwise an albedo texture is loaded from ``albedo_path`` or from the first
+        ``map_Kd`` entry of the material file, falling back to a uniform gray texture.
+        Args:
+            path: path of the ``.obj`` file (the extension is asserted).
+            albedo_path: optional texture image; when ``None`` it is looked up in the mtl file.
+            device: torch device for all tensors; defaults to CUDA when available, else CPU.
+        Returns:
+            The loaded mesh.
+        """
+        assert os.path.splitext(path)[-1] == ".obj"
+        mesh = cls()
+        # device
+        if device is None:
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        mesh.device = device
+        # load obj
+        with open(path, "r") as f:
+            lines = f.readlines()
+        def parse_f_v(fv):
+            # pass in a vertex term of a face, return {v, vt, vn} (-1 if not provided)
+            # supported forms:
+            # f v1 v2 v3
+            # f v1/vt1 v2/vt2 v3/vt3
+            # f v1/vt1/vn1 v2/vt2/vn2 v3/vt3/vn3
+            # f v1//vn1 v2//vn2 v3//vn3
+            # obj indices are 1-based, hence the "- 1"
+            xs = [int(x) - 1 if x != "" else -1 for x in fv.split("/")]
+            # pad the missing trailing entries with -1
+            xs.extend([-1] * (3 - len(xs)))
+            return xs[0], xs[1], xs[2]
+        # NOTE: we ignore usemtl, and assume the mesh ONLY uses one material (first in mtl)
+        vertices, texcoords, normals = [], [], []
+        faces, tfaces, nfaces = [], [], []
+        mtl_path = None
+        for line in lines:
+            split_line = line.split()
+            # empty line
+            if len(split_line) == 0:
+                continue
+            prefix = split_line[0].lower()
+            # mtllib
+            if prefix == "mtllib":
+                mtl_path = split_line[1]
+            # usemtl
+            elif prefix == "usemtl":
+                pass # ignored
+            # v/vn/vt
+            elif prefix == "v":
+                vertices.append([float(v) for v in split_line[1:]])
+            elif prefix == "vn":
+                normals.append([float(v) for v in split_line[1:]])
+            elif prefix == "vt":
+                val = [float(v) for v in split_line[1:]]
+                # keep u, flip v
+                texcoords.append([val[0], 1.0 - val[1]])
+            elif prefix == "f":
+                vs = split_line[1:]
+                nv = len(vs)
+                # the first corner is shared by every triangle of the fan
+                v0, t0, n0 = parse_f_v(vs[0])
+                for i in range(nv - 2):  # triangulate (assume vertices are ordered)
+                    v1, t1, n1 = parse_f_v(vs[i + 1])
+                    v2, t2, n2 = parse_f_v(vs[i + 2])
+                    faces.append([v0, v1, v2])
+                    tfaces.append([t0, t1, t2])
+                    nfaces.append([n0, n1, n2])
+        mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
+        mesh.vt = (
+            torch.tensor(texcoords, dtype=torch.float32, device=device)
+            if len(texcoords) > 0
+            else None
+        )
+        mesh.vn = (
+            torch.tensor(normals, dtype=torch.float32, device=device)
+            if len(normals) > 0
+            else None
+        )
+        mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
+        # ft/fn exist only when the file has vt/vn records; corners that omit them keep the -1 from parse_f_v
+        mesh.ft = (
+            torch.tensor(tfaces, dtype=torch.int32, device=device)
+            if len(texcoords) > 0
+            else None
+        )
+        mesh.fn = (
+            torch.tensor(nfaces, dtype=torch.int32, device=device)
+            if len(normals) > 0
+            else None
+        )
+        # see if there is vertex color
+        use_vertex_color = False
+        # six values per vertex line: the last three are used as the color
+        if mesh.v.shape[1] == 6:
+            use_vertex_color = True
+            mesh.vc = mesh.v[:, 3:]
+            mesh.v = mesh.v[:, :3]
+            print(f"[load_obj] use vertex color: {mesh.vc.shape}")
+        # try to load texture image
+        if not use_vertex_color:
+            # try to retrieve mtl file
+            # candidates, in order: the mtllib value as written, the same name next to the obj, then <obj name>.mtl
+            mtl_path_candidates = []
+            if mtl_path is not None:
+                mtl_path_candidates.append(mtl_path)
+                mtl_path_candidates.append(os.path.join(os.path.dirname(path), mtl_path))
+            mtl_path_candidates.append(path.replace(".obj", ".mtl"))
+            mtl_path = None
+            for candidate in mtl_path_candidates:
+                if os.path.exists(candidate):
+                    mtl_path = candidate
+                    break
+            # if albedo_path is not provided, try retrieve it from mtl
+            if mtl_path is not None and albedo_path is None:
+                with open(mtl_path, "r") as f:
+                    lines = f.readlines()
+                for line in lines:
+                    split_line = line.split()
+                    # empty line
+                    if len(split_line) == 0:
+                        continue
+                    prefix = split_line[0]
+                    # NOTE: simply use the first map_Kd as albedo!
+                    if "map_Kd" in prefix:
+                        # the texture path is resolved relative to the obj file's directory
+                        albedo_path = os.path.join(os.path.dirname(path), split_line[1])
+                        print(f"[load_obj] use texture from: {albedo_path}")
+                        break
+            # still not found albedo_path, or the path doesn't exist
+            if albedo_path is None or not os.path.exists(albedo_path):
+                # init an empty texture
+                print(f"[load_obj] init empty albedo!")
+                # albedo = np.random.rand(1024, 1024, 3).astype(np.float32)
+                albedo = np.ones((1024, 1024, 3), dtype=np.float32) * np.array([0.5, 0.5, 0.5])  # default color
+            else:
+                # NOTE(review): cv2.imread returns None for an unreadable file, which is not handled here — confirm inputs
+                albedo = cv2.imread(albedo_path, cv2.IMREAD_UNCHANGED)
+                albedo = cv2.cvtColor(albedo, cv2.COLOR_BGR2RGB)
+                albedo = albedo.astype(np.float32) / 255
+                print(f"[load_obj] load texture: {albedo.shape}")
+                # import matplotlib.pyplot as plt
+                # plt.imshow(albedo)
+                # plt.show()
+            mesh.albedo = torch.tensor(albedo, dtype=torch.float32, device=device)
+        return mesh
+    @classmethod
+    def load_trimesh(cls, path, device=None):
+        """Load a mesh through ``trimesh`` (used by ``load`` for every non-obj file).
+        A scene with one geometry yields that geometry; a scene with several has its
+        ``Trimesh`` members concatenated. Vertex colors or the base color texture are
+        copied when present, otherwise a uniform gray albedo is created.
+        Args:
+            path: path of the mesh file handed to ``trimesh.load``.
+            device: torch device for all tensors; defaults to CUDA when available, else CPU.
+        Returns:
+            The loaded mesh.
+        Raises:
+            NotImplementedError: if a textured mesh uses a material type other than
+                ``PBRMaterial`` or ``SimpleMaterial``.
+        """
+        mesh = cls()
+        # device
+        if device is None:
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        mesh.device = device
+        # use trimesh to load ply/glb, assume only has one single RootMesh...
+        _data = trimesh.load(path)
+        if isinstance(_data, trimesh.Scene):
+            if len(_data.geometry) == 1:
+                _mesh = list(_data.geometry.values())[0]
+            else:
+                # manual concat, will lose texture
+                _concat = []
+                for g in _data.geometry.values():
+                    # geometries that are not Trimesh instances are dropped
+                    if isinstance(g, trimesh.Trimesh):
+                        _concat.append(g)
+                _mesh = trimesh.util.concatenate(_concat)
+        else:
+            _mesh = _data
+        if _mesh.visual.kind == 'vertex':
+            vertex_colors = _mesh.visual.vertex_colors
+            # keep the first three channels and rescale from 0..255 to 0..1
+            vertex_colors = np.array(vertex_colors[..., :3]).astype(np.float32) / 255
+            mesh.vc = torch.tensor(vertex_colors, dtype=torch.float32, device=device)
+            print(f"[load_trimesh] use vertex color: {mesh.vc.shape}")
+        elif _mesh.visual.kind == 'texture':
+            _material = _mesh.visual.material
+            if isinstance(_material, trimesh.visual.material.PBRMaterial):
+                # NOTE(review): presumably baseColorTexture can be unset on some materials — verify before relying on it
+                texture = np.array(_material.baseColorTexture).astype(np.float32) / 255
+            elif isinstance(_material, trimesh.visual.material.SimpleMaterial):
+                texture = np.array(_material.to_pbr().baseColorTexture).astype(np.float32) / 255
+            else:
+                raise NotImplementedError(f"material type {type(_material)} not supported!")
+            mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device)
+            print(f"[load_trimesh] load texture: {texture.shape}")
+        else:
+            # neither vertex colors nor a texture: fall back to a uniform gray albedo
+            texture = np.ones((1024, 1024, 3), dtype=np.float32) * np.array([0.5, 0.5, 0.5])
+            mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device)
+            print(f"[load_trimesh] failed to load texture.")
+        vertices = _mesh.vertices
+        # any failure to read uv (e.g. the visual has none) means "no texture coordinates"
+        try:
+            texcoords = _mesh.visual.uv
+            # flip v; this writes into the array returned by trimesh
+            texcoords[:, 1] = 1 - texcoords[:, 1]
+        except Exception as e:
+            texcoords = None
+        try:
+            normals = _mesh.vertex_normals
+        except Exception as e:
+            normals = None
+        # trimesh only support vertex uv...
+        # so positions, uvs and normals all share one face index buffer
+        faces = tfaces = nfaces = _mesh.faces
+        mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
+        mesh.vt = (
+            torch.tensor(texcoords, dtype=torch.float32, device=device)
+            if texcoords is not None
+            else None
+        )
+        mesh.vn = (
+            torch.tensor(normals, dtype=torch.float32, device=device)
+            if normals is not None
+            else None
+        )
+        mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
+        mesh.ft = (
+            torch.tensor(tfaces, dtype=torch.int32, device=device)
+            if texcoords is not None
+            else None
+        )
+        mesh.fn = (
+            torch.tensor(nfaces, dtype=torch.int32, device=device)
+            if normals is not None
+            else None
+        )
+        return mesh
+    # aabb
+    def aabb(self):
+        return torch.min(self.v, dim=0).values, torch.max(self.v, dim=0).values
+    # unit size
+    @torch.no_grad()
+    def auto_size(self):
+        vmin, vmax = self.aabb()
+        self.ori_center = (vmax + vmin) / 2
+        self.ori_scale = 1.2 / torch.max(vmax - vmin).item()
+        self.v = (self.v - self.ori_center) * self.ori_scale
+    def auto_normal(self):
+        i0, i1, i2 = self.f[:, 0].long(), self.f[:, 1].long(), self.f[:, 2].long()
+        v0, v1, v2 = self.v[i0, :], self.v[i1, :], self.v[i2, :]
+        face_normals = torch.cross(v1 - v0, v2 - v0)
+        # Splat face normals to vertices
+        vn = torch.zeros_like(self.v)
+        vn.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
+        vn.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
+        vn.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)
+        # Normalize, replace zero (degenerated) normals with some default value
+        vn = torch.where(
+            dot(vn, vn) > 1e-20,
+            vn,
+            torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device),
+        )
+        vn = safe_normalize(vn)
+        self.vn = vn
+        self.fn = self.f
+    def auto_uv(self, cache_path=None, vmap=True):
+        # try to load cache
+        if cache_path is not None:
+            cache_path = os.path.splitext(cache_path)[0] + "_uv.npz"
+        if cache_path is not None and os.path.exists(cache_path):
+            data = np.load(cache_path)
+            vt_np, ft_np, vmapping = data["vt"], data["ft"], data["vmapping"]
+        else:
+            import xatlas
+            v_np = self.v.detach().cpu().numpy()
+            f_np = self.f.detach().int().cpu().numpy()
+            atlas = xatlas.Atlas()
+            atlas.add_mesh(v_np, f_np)
+            chart_options = xatlas.ChartOptions()
+            # chart_options.max_iterations = 4
+            atlas.generate(chart_options=chart_options)
+            vmapping, ft_np, vt_np = atlas[0]  # [N], [M, 3], [N, 2]
+            # save to cache
+            if cache_path is not None:
+                np.savez(cache_path, vt=vt_np, ft=ft_np, vmapping=vmapping)
+        vt = torch.from_numpy(vt_np.astype(np.float32)).to(self.device)
+        ft = torch.from_numpy(ft_np.astype(np.int32)).to(self.device)
+        self.vt = vt
+        self.ft = ft
+        if vmap:
+            # remap v/f to vt/ft, so each v correspond to a unique vt. (necessary for gltf)
+            vmapping = torch.from_numpy(vmapping.astype(np.int64)).long().to(self.device)
+            self.align_v_to_vt(vmapping)
+    def align_v_to_vt(self, vmapping=None):
+        # remap v/f and vn/vn to vt/ft.
+        if vmapping is None:
+            ft = self.ft.view(-1).long()
+            f = self.f.view(-1).long()
+            vmapping = torch.zeros(self.vt.shape[0], dtype=torch.long, device=self.device)
+            vmapping[ft] = f # scatter, randomly choose one if index is not unique
+        self.v = self.v[vmapping]
+        self.f = self.ft
+        # assume fn == f
+        if self.vn is not None:
+            self.vn = self.vn[vmapping]
+            self.fn = self.ft
+    def to(self, device):
+        self.device = device
+        for name in ["v", "f", "vn", "fn", "vt", "ft", "albedo"]:
+            tensor = getattr(self, name)
+            if tensor is not None:
+                setattr(self, name, tensor.to(device))
+        return self
+    def write(self, path):
+        if path.endswith(".ply"):
+            self.write_ply(path)
+        elif path.endswith(".obj"):
+            self.write_obj(path)
+        elif path.endswith(".glb") or path.endswith(".gltf"):
+            self.write_glb(path)
+        else:
+            raise NotImplementedError(f"format {path} not supported!")
+    # write to ply file (only geom)
+    def write_ply(self, path):
+        v_np = self.v.detach().cpu().numpy()
+        f_np = self.f.detach().cpu().numpy()
+        _mesh = trimesh.Trimesh(vertices=v_np, faces=f_np)
+        _mesh.export(path)
+    # write to gltf/glb file (geom + texture)
+    def write_glb(self, path):
+        """Export geometry, texture coordinates and the albedo texture with pygltflib.
+
+        All data goes into a single binary buffer laid out as
+        ``indices | positions | texcoords | PNG-encoded albedo``; the buffer
+        views below address those four regions in that order.
+
+        Requires ``self.vn`` and ``self.vt`` to be set (enforced by the assert),
+        although normals themselves are not written: the vn lines are
+        commented out. If ``v`` and ``vt`` differ in length the mesh is first
+        re-indexed in place through ``align_v_to_vt``.
+        """
+        assert self.vn is not None and self.vt is not None # should be improved to support export without texture...
+        # assert self.v.shape[0] == self.vn.shape[0] and self.v.shape[0] == self.vt.shape[0]
+        if self.v.shape[0] != self.vt.shape[0]:
+            self.align_v_to_vt()
+        # assume f == fn == ft
+        import pygltflib
+        f_np = self.f.detach().cpu().numpy().astype(np.uint32)
+        v_np = self.v.detach().cpu().numpy().astype(np.float32)
+        # vn_np = self.vn.detach().cpu().numpy().astype(np.float32)
+        vt_np = self.vt.detach().cpu().numpy().astype(np.float32)
+        # scale albedo to 8-bit and swap RGB -> BGR, the channel order cv2.imencode works with
+        albedo = self.albedo.detach().cpu().numpy()
+        albedo = (albedo * 255).astype(np.uint8)
+        albedo = cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR)
+        # raw bytes for each region of the shared buffer
+        f_np_blob = f_np.flatten().tobytes()
+        v_np_blob = v_np.tobytes()
+        # vn_np_blob = vn_np.tobytes()
+        vt_np_blob = vt_np.tobytes()
+        albedo_blob = cv2.imencode('.png', albedo)[1].tobytes()
+        gltf = pygltflib.GLTF2(
+            scene=0,
+            scenes=[pygltflib.Scene(nodes=[0])],
+            nodes=[pygltflib.Node(mesh=0)],
+            meshes=[pygltflib.Mesh(primitives=[
+                pygltflib.Primitive(
+                    # indices to accessors (0 is triangles)
+                    attributes=pygltflib.Attributes(
+                        POSITION=1, TEXCOORD_0=2,
+                    ),
+                    indices=0, material=0,
+                )
+            ])],
+            materials=[
+                pygltflib.Material(
+                    pbrMetallicRoughness=pygltflib.PbrMetallicRoughness(
+                        baseColorTexture=pygltflib.TextureInfo(index=0, texCoord=0),
+                        metallicFactor=0.0,
+                        roughnessFactor=1.0,
+                    ),
+                    alphaCutoff=0,
+                    doubleSided=True,
+                )
+            ],
+            textures=[
+                pygltflib.Texture(sampler=0, source=0),
+            ],
+            samplers=[
+                pygltflib.Sampler(magFilter=pygltflib.LINEAR, minFilter=pygltflib.LINEAR_MIPMAP_LINEAR, wrapS=pygltflib.REPEAT, wrapT=pygltflib.REPEAT),
+            ],
+            images=[
+                # use embedded (buffer) image
+                pygltflib.Image(bufferView=3, mimeType="image/png"),
+            ],
+            buffers=[
+                pygltflib.Buffer(byteLength=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob) + len(albedo_blob))
+            ],
+            # buffer view (based on dtype)
+            bufferViews=[
+                # triangles; as flatten (element) array
+                pygltflib.BufferView(
+                    buffer=0,
+                    byteLength=len(f_np_blob),
+                    target=pygltflib.ELEMENT_ARRAY_BUFFER, # GL_ELEMENT_ARRAY_BUFFER (34963)
+                ),
+                # positions; as vec3 array
+                pygltflib.BufferView(
+                    buffer=0,
+                    byteOffset=len(f_np_blob),
+                    byteLength=len(v_np_blob),
+                    byteStride=12, # vec3
+                    target=pygltflib.ARRAY_BUFFER, # GL_ARRAY_BUFFER (34962)
+                ),
+                # texcoords; as vec2 array
+                pygltflib.BufferView(
+                    buffer=0,
+                    byteOffset=len(f_np_blob) + len(v_np_blob),
+                    byteLength=len(vt_np_blob),
+                    byteStride=8, # vec2
+                    target=pygltflib.ARRAY_BUFFER,
+                ),
+                # texture; as none target
+                pygltflib.BufferView(
+                    buffer=0,
+                    byteOffset=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob),
+                    byteLength=len(albedo_blob),
+                ),
+            ],
+            accessors=[
+                # 0 = triangles
+                pygltflib.Accessor(
+                    bufferView=0,
+                    componentType=pygltflib.UNSIGNED_INT, # GL_UNSIGNED_INT (5125)
+                    count=f_np.size,
+                    type=pygltflib.SCALAR,
+                    max=[int(f_np.max())],
+                    min=[int(f_np.min())],
+                ),
+                # 1 = positions
+                pygltflib.Accessor(
+                    bufferView=1,
+                    componentType=pygltflib.FLOAT, # GL_FLOAT (5126)
+                    count=len(v_np),
+                    type=pygltflib.VEC3,
+                    max=v_np.max(axis=0).tolist(),
+                    min=v_np.min(axis=0).tolist(),
+                ),
+                # 2 = texcoords
+                pygltflib.Accessor(
+                    bufferView=2,
+                    componentType=pygltflib.FLOAT,
+                    count=len(vt_np),
+                    type=pygltflib.VEC2,
+                    max=vt_np.max(axis=0).tolist(),
+                    min=vt_np.min(axis=0).tolist(),
+                ),
+            ],
+        )
+        # set actual data
+        # concatenation order must match the byteOffset values of the buffer views above
+        gltf.set_binary_blob(f_np_blob + v_np_blob + vt_np_blob + albedo_blob)
+        # glb = b"".join(gltf.save_to_bytes())
+        gltf.save(path)
+    # write to obj file (geom + texture)
+    def write_obj(self, path):
+        mtl_path = path.replace(".obj", ".mtl")
+        albedo_path = path.replace(".obj", "_albedo.png")
+        v_np = self.v.detach().cpu().numpy()
+        vt_np = self.vt.detach().cpu().numpy() if self.vt is not None else None
+        vn_np = self.vn.detach().cpu().numpy() if self.vn is not None else None
+        f_np = self.f.detach().cpu().numpy()
+        ft_np = self.ft.detach().cpu().numpy() if self.ft is not None else None
+        fn_np = self.fn.detach().cpu().numpy() if self.fn is not None else None
+        with open(path, "w") as fp:
+            fp.write(f"mtllib {os.path.basename(mtl_path)} \n")
+            for v in v_np:
+                fp.write(f"v {v[0]} {v[1]} {v[2]} \n")
+            if vt_np is not None:
+                for v in vt_np:
+                    fp.write(f"vt {v[0]} {1 - v[1]} \n")
+            if vn_np is not None:
+                for v in vn_np:
+                    fp.write(f"vn {v[0]} {v[1]} {v[2]} \n")
+            fp.write(f"usemtl defaultMat \n")
+            for i in range(len(f_np)):
+                fp.write(
+                    f'f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1 if ft_np is not None else ""}/{fn_np[i, 0] + 1 if fn_np is not None else ""} \
+                             {f_np[i, 1] + 1}/{ft_np[i, 1] + 1 if ft_np is not None else ""}/{fn_np[i, 1] + 1 if fn_np is not None else ""} \
+                             {f_np[i, 2] + 1}/{ft_np[i, 2] + 1 if ft_np is not None else ""}/{fn_np[i, 2] + 1 if fn_np is not None else ""} \n'
+                )
+        with open(mtl_path, "w") as fp:
+            fp.write(f"newmtl defaultMat \n")
+            fp.write(f"Ka 1 1 1 \n")
+            fp.write(f"Kd 1 1 1 \n")
+            fp.write(f"Ks 0 0 0 \n")
+            fp.write(f"Tr 1 \n")
+            fp.write(f"illum 1 \n")
+            fp.write(f"Ns 0 \n")
+            fp.write(f"map_Kd {os.path.basename(albedo_path)} \n")
+        albedo = self.albedo.detach().cpu().numpy()
+        albedo = (albedo * 255).astype(np.uint8)
+        cv2.imwrite(albedo_path, cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR))

mesh_renderer.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import os
+import math
+import cv2
+import trimesh
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import nvdiffrast.torch as dr
+from mesh import Mesh, safe_normalize
+def scale_img_nhwc(x, size, mag='bilinear', min='bilinear'):
+    assert (x.shape[1] >= size[0] and x.shape[2] >= size[1]) or (x.shape[1] < size[0] and x.shape[2] < size[1]), "Trying to magnify image in one dimension and minify in the other"
+    y = x.permute(0, 3, 1, 2) # NHWC -> NCHW
+    if x.shape[1] > size[0] and x.shape[2] > size[1]: # Minification, previous size was bigger
+        y = torch.nn.functional.interpolate(y, size, mode=min)
+    else: # Magnification
+        if mag == 'bilinear' or mag == 'bicubic':
+            y = torch.nn.functional.interpolate(y, size, mode=mag, align_corners=True)
+        else:
+            y = torch.nn.functional.interpolate(y, size, mode=mag)
+    return y.permute(0, 2, 3, 1).contiguous() # NCHW -> NHWC
+def scale_img_hwc(x, size, mag='bilinear', min='bilinear'):
+    return scale_img_nhwc(x[None, ...], size, mag, min)[0]
+def scale_img_nhw(x, size, mag='bilinear', min='bilinear'):
+    return scale_img_nhwc(x[..., None], size, mag, min)[..., 0]
+def scale_img_hw(x, size, mag='bilinear', min='bilinear'):
+    return scale_img_nhwc(x[None, ..., None], size, mag, min)[0, ..., 0]
+def trunc_rev_sigmoid(x, eps=1e-6):
+    x = x.clamp(eps, 1 - eps)
+    return torch.log(x / (1 - x))
+def make_divisible(x, m=8):
+    return int(math.ceil(x / m) * m)
+class Renderer(nn.Module):
+    def __init__(self, opt):
+        super().__init__()
+        self.opt = opt
+        self.mesh = Mesh.load(self.opt.mesh, resize=False)
+        if not self.opt.force_cuda_rast and (not self.opt.gui or os.name == 'nt'):
+            self.glctx = dr.RasterizeGLContext()
+        else:
+            self.glctx = dr.RasterizeCudaContext()
+        # extract trainable parameters
+        self.v_offsets = nn.Parameter(torch.zeros_like(self.mesh.v))
+        self.raw_albedo = nn.Parameter(trunc_rev_sigmoid(self.mesh.albedo))
+    def get_params(self):
+        params = [
+            {'params': self.raw_albedo, 'lr': self.opt.texture_lr},
+        ]
+        if self.opt.train_geo:
+            params.append({'params': self.v_offsets, 'lr': self.opt.geom_lr})
+        return params
+    @torch.no_grad()
+    def export_mesh(self, save_path):
+        self.mesh.v = (self.mesh.v + self.v_offsets).detach()
+        self.mesh.albedo = torch.sigmoid(self.raw_albedo.detach())
+        self.mesh.write(save_path)
+    def render(self, pose, proj, h0, w0, ssaa=1, bg_color=1, texture_filter='linear-mipmap-linear'):
+        # do super-sampling
+        if ssaa != 1:
+            h = make_divisible(h0 * ssaa, 8)
+            w = make_divisible(w0 * ssaa, 8)
+        else:
+            h, w = h0, w0
+        results = {}
+        # get v
+        if self.opt.train_geo:
+            v = self.mesh.v + self.v_offsets # [N, 3]
+        else:
+            v = self.mesh.v
+        pose = torch.from_numpy(pose.astype(np.float32)).to(v.device)
+        proj = torch.from_numpy(proj.astype(np.float32)).to(v.device)
+        # get v_clip and render rgb
+        v_cam = torch.matmul(F.pad(v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
+        v_clip = v_cam @ proj.T
+        rast, rast_db = dr.rasterize(self.glctx, v_clip, self.mesh.f, (h, w))
+        alpha = (rast[0, ..., 3:] > 0).float()
+        depth, _ = dr.interpolate(-v_cam[..., [2]], rast, self.mesh.f) # [1, H, W, 1]
+        depth = depth.squeeze(0) # [H, W, 1]
+        texc, texc_db = dr.interpolate(self.mesh.vt.unsqueeze(0).contiguous(), rast, self.mesh.ft, rast_db=rast_db, diff_attrs='all')
+        albedo = dr.texture(self.raw_albedo.unsqueeze(0), texc, uv_da=texc_db, filter_mode=texture_filter) # [1, H, W, 3]
+        albedo = torch.sigmoid(albedo)
+        # get vn and render normal
+        if self.opt.train_geo:
+            i0, i1, i2 = self.mesh.f[:, 0].long(), self.mesh.f[:, 1].long(), self.mesh.f[:, 2].long()
+            v0, v1, v2 = v[i0, :], v[i1, :], v[i2, :]
+            face_normals = torch.cross(v1 - v0, v2 - v0)
+            face_normals = safe_normalize(face_normals)
+            vn = torch.zeros_like(v)
+            vn.scatter_add_(0, i0[:, None].repeat(1,3), face_normals)
+            vn.scatter_add_(0, i1[:, None].repeat(1,3), face_normals)
+            vn.scatter_add_(0, i2[:, None].repeat(1,3), face_normals)
+            vn = torch.where(torch.sum(vn * vn, -1, keepdim=True) > 1e-20, vn, torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device))
+        else:
+            vn = self.mesh.vn
+        normal, _ = dr.interpolate(vn.unsqueeze(0).contiguous(), rast, self.mesh.fn)
+        normal = safe_normalize(normal[0])
+        # rotated normal (where [0, 0, 1] always faces camera)
+        rot_normal = normal @ pose[:3, :3]
+        viewcos = rot_normal[..., [2]]
+        # antialias
+        albedo = dr.antialias(albedo, rast, v_clip, self.mesh.f).squeeze(0) # [H, W, 3]
+        albedo = alpha * albedo + (1 - alpha) * bg_color
+        # ssaa
+        if ssaa != 1:
+            albedo = scale_img_hwc(albedo, (h0, w0))
+            alpha = scale_img_hwc(alpha, (h0, w0))
+            depth = scale_img_hwc(depth, (h0, w0))
+            normal = scale_img_hwc(normal, (h0, w0))
+            viewcos = scale_img_hwc(viewcos, (h0, w0))
+        results['image'] = albedo.clamp(0, 1)
+        results['alpha'] = alpha
+        results['depth'] = depth
+        results['normal'] = (normal + 1) / 2
+        results['viewcos'] = viewcos
+        return results

mesh_utils.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import numpy as np
+import pymeshlab as pml
+def poisson_mesh_reconstruction(points, normals=None):
+    # points/normals: [N, 3] np.ndarray
+    import open3d as o3d
+    pcd = o3d.geometry.PointCloud()
+    pcd.points = o3d.utility.Vector3dVector(points)
+    # outlier removal
+    pcd, ind = pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=10)
+    # normals
+    if normals is None:
+        pcd.estimate_normals()
+    else:
+        pcd.normals = o3d.utility.Vector3dVector(normals[ind])
+    # visualize
+    o3d.visualization.draw_geometries([pcd], point_show_normal=False)
+    mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
+        pcd, depth=9
+    )
+    vertices_to_remove = densities < np.quantile(densities, 0.1)
+    mesh.remove_vertices_by_mask(vertices_to_remove)
+    # visualize
+    o3d.visualization.draw_geometries([mesh])
+    vertices = np.asarray(mesh.vertices)
+    triangles = np.asarray(mesh.triangles)
+    print(
+        f"[INFO] poisson mesh reconstruction: {points.shape} --> {vertices.shape} / {triangles.shape}"
+    )
+    return vertices, triangles
+def decimate_mesh(
+    verts, faces, target, backend="pymeshlab", remesh=False, optimalplacement=True
+):
+    # optimalplacement: default is True, but for flat mesh must turn False to prevent spike artifect.
+    _ori_vert_shape = verts.shape
+    _ori_face_shape = faces.shape
+    if backend == "pyfqmr":
+        import pyfqmr
+        solver = pyfqmr.Simplify()
+        solver.setMesh(verts, faces)
+        solver.simplify_mesh(target_count=target, preserve_border=False, verbose=False)
+        verts, faces, normals = solver.getMesh()
+    else:
+        m = pml.Mesh(verts, faces)
+        ms = pml.MeshSet()
+        ms.add_mesh(m, "mesh")  # will copy!
+        # filters
+        # ms.meshing_decimation_clustering(threshold=pml.Percentage(1))
+        ms.meshing_decimation_quadric_edge_collapse(
+            targetfacenum=int(target), optimalplacement=optimalplacement
+        )
+        if remesh:
+            # ms.apply_coord_taubin_smoothing()
+            ms.meshing_isotropic_explicit_remeshing(
+                iterations=3, targetlen=pml.Percentage(1)
+            )
+        # extract mesh
+        m = ms.current_mesh()
+        verts = m.vertex_matrix()
+        faces = m.face_matrix()
+    print(
+        f"[INFO] mesh decimation: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}"
+    )
+    return verts, faces
+def clean_mesh(
+    verts,
+    faces,
+    v_pct=1,
+    min_f=64,
+    min_d=20,
+    repair=True,
+    remesh=True,
+    remesh_size=0.01,
+):
+    # verts: [N, 3]
+    # faces: [N, 3]
+    _ori_vert_shape = verts.shape
+    _ori_face_shape = faces.shape
+    m = pml.Mesh(verts, faces)
+    ms = pml.MeshSet()
+    ms.add_mesh(m, "mesh")  # will copy!
+    # filters
+    ms.meshing_remove_unreferenced_vertices()  # verts not refed by any faces
+    if v_pct > 0:
+        ms.meshing_merge_close_vertices(
+            threshold=pml.Percentage(v_pct)
+        )  # 1/10000 of bounding box diagonal
+    ms.meshing_remove_duplicate_faces()  # faces defined by the same verts
+    ms.meshing_remove_null_faces()  # faces with area == 0
+    if min_d > 0:
+        ms.meshing_remove_connected_component_by_diameter(
+            mincomponentdiag=pml.Percentage(min_d)
+        )
+    if min_f > 0:
+        ms.meshing_remove_connected_component_by_face_number(mincomponentsize=min_f)
+    if repair:
+        # ms.meshing_remove_t_vertices(method=0, threshold=40, repeat=True)
+        ms.meshing_repair_non_manifold_edges(method=0)
+        ms.meshing_repair_non_manifold_vertices(vertdispratio=0)
+    if remesh:
+        # ms.apply_coord_taubin_smoothing()
+        ms.meshing_isotropic_explicit_remeshing(
+            iterations=3, targetlen=pml.AbsoluteValue(remesh_size)
+        )
+    # extract mesh
+    m = ms.current_mesh()
+    verts = m.vertex_matrix()
+    faces = m.face_matrix()
+    print(
+        f"[INFO] mesh cleaning: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}"
+    )
+    return verts, faces

process.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import os
+import glob
+import sys
+import cv2
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+from PIL import Image
+import rembg
+class BLIP2():
+    def __init__(self, device='cuda'):
+        self.device = device
+        from transformers import AutoProcessor, Blip2ForConditionalGeneration
+        self.processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16).to(device)
+    @torch.no_grad()
+    def __call__(self, image):
+        image = Image.fromarray(image)
+        inputs = self.processor(image, return_tensors="pt").to(self.device, torch.float16)
+        generated_ids = self.model.generate(**inputs, max_new_tokens=20)
+        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+        return generated_text
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, help="path to image (png, jpeg, etc.)")
+    parser.add_argument('--model', default='u2net', type=str, help="rembg model, see https://github.com/danielgatis/rembg#models")
+    parser.add_argument('--size', default=256, type=int, help="output resolution")
+    parser.add_argument('--border_ratio', default=0.2, type=float, help="output border ratio")
+    parser.add_argument('--recenter', type=bool, default=True, help="recenter, potentially not helpful for multiview zero123")
+    opt = parser.parse_args()
+    session = rembg.new_session(model_name=opt.model)
+    if os.path.isdir(opt.path):
+        print(f'[INFO] processing directory {opt.path}...')
+        files = glob.glob(f'{opt.path}/*')
+        out_dir = opt.path
+    else: # isfile
+        files = [opt.path]
+        out_dir = os.path.dirname(opt.path)
+    for file in files:
+        out_base = os.path.basename(file).split('.')[0]
+        out_rgba = os.path.join(out_dir, out_base + '_rgba.png')
+        # load image
+        print(f'[INFO] loading image {file}...')
+        image = cv2.imread(file, cv2.IMREAD_UNCHANGED)
+        # carve background
+        print(f'[INFO] background removal...')
+        carved_image = rembg.remove(image, session=session) # [H, W, 4]
+        mask = carved_image[..., -1] > 0
+        # recenter
+        if opt.recenter:
+            print(f'[INFO] recenter...')
+            final_rgba = np.zeros((opt.size, opt.size, 4), dtype=np.uint8)
+            coords = np.nonzero(mask)
+            x_min, x_max = coords[0].min(), coords[0].max()
+            y_min, y_max = coords[1].min(), coords[1].max()
+            h = x_max - x_min
+            w = y_max - y_min
+            desired_size = int(opt.size * (1 - opt.border_ratio))
+            scale = desired_size / max(h, w)
+            h2 = int(h * scale)
+            w2 = int(w * scale)
+            x2_min = (opt.size - h2) // 2
+            x2_max = x2_min + h2
+            y2_min = (opt.size - w2) // 2
+            y2_max = y2_min + w2
+            final_rgba[x2_min:x2_max, y2_min:y2_max] = cv2.resize(carved_image[x_min:x_max, y_min:y_max], (w2, h2), interpolation=cv2.INTER_AREA)
+        else:
+            final_rgba = carved_image
+        # write image
+        cv2.imwrite(out_rgba, final_rgba)

readme.md ADDED Viewed

	@@ -0,0 +1,139 @@

+# DreamGaussian
+This repository contains the official implementation for [DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content Creation](https://arxiv.org/abs/2309.16653).
+### [Project Page](https://dreamgaussian.github.io) | [Arxiv](https://arxiv.org/abs/2309.16653)
+https://github.com/dreamgaussian/dreamgaussian/assets/25863658/db860801-7b9c-4b30-9eb9-87330175f5c8
+### [Colab demo](https://github.com/camenduru/dreamgaussian-colab)
+* Image-to-3D: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sLpYmmLS209-e5eHgcuqdryFRRO6ZhFS?usp=sharing)
+* Text-to-3D: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/dreamgaussian-colab/blob/main/dreamgaussian_colab.ipynb)
+### [Gradio demo](https://huggingface.co/spaces/jiawei011/dreamgaussian)
+* Image-to-3D: <a href="https://huggingface.co/spaces/jiawei011/dreamgaussian"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Gradio%20Demo-Huggingface-orange"></a>
+## Install
+```bash
+pip install -r requirements.txt
+# a modified gaussian splatting (+ depth, alpha rendering)
+git clone --recursive https://github.com/ashawkey/diff-gaussian-rasterization
+pip install ./diff-gaussian-rasterization
+# simple-knn
+pip install ./simple-knn
+# nvdiffrast
+pip install git+https://github.com/NVlabs/nvdiffrast/
+# kiuikit
+pip install git+https://github.com/ashawkey/kiuikit
+```
+Tested on:
+* Ubuntu 22 with torch 1.12 & CUDA 11.6 on a V100.
+* Windows 10 with torch 2.1 & CUDA 12.1 on a 3070.
+## Usage
+Image-to-3D:
+```bash
+### preprocess
+# background removal and recentering, save rgba at 256x256
+python process.py data/name.jpg
+# save at a larger resolution
+python process.py data/name.jpg --size 512
+# process all jpg images under a dir
+python process.py data
+### training gaussian stage
+# train 500 iters (~1min) and export ckpt & coarse_mesh to logs
+python main.py --config configs/image.yaml input=data/name_rgba.png save_path=name
+# gui mode (supports visualizing training)
+python main.py --config configs/image.yaml input=data/name_rgba.png save_path=name gui=True
+# load and visualize a saved ckpt
+python main.py --config configs/image.yaml load=logs/name_model.ply gui=True
+# use an estimated elevation angle if image is not front-view (e.g., common looking-down image can use -30)
+python main.py --config configs/image.yaml input=data/name_rgba.png save_path=name elevation=-30
+### training mesh stage
+# auto load coarse_mesh and refine 50 iters (~1min), export fine_mesh to logs
+python main2.py --config configs/image.yaml input=data/name_rgba.png save_path=name
+# specify coarse mesh path explicitly
+python main2.py --config configs/image.yaml input=data/name_rgba.png save_path=name mesh=logs/name_mesh.obj
+# gui mode
+python main2.py --config configs/image.yaml input=data/name_rgba.png save_path=name gui=True
+# export glb instead of obj
+python main2.py --config configs/image.yaml input=data/name_rgba.png save_path=name mesh_format=glb
+### visualization
+# gui for visualizing mesh
+python -m kiui.render logs/name.obj
+# save 360 degree video of mesh (can run without gui)
+python -m kiui.render logs/name.obj --save_video name.mp4 --wogui
+# save 8 view images of mesh (can run without gui)
+python -m kiui.render logs/name.obj --save images/name/ --wogui
+### evaluation of CLIP-similarity
+python -m kiui.cli.clip_sim data/name_rgba.png logs/name.obj
+```
+Please check `./configs/image.yaml` for more options.
+Text-to-3D:
+```bash
+### training gaussian stage
+python main.py --config configs/text.yaml prompt="a photo of an icecream" save_path=icecream
+### training mesh stage
+python main2.py --config configs/text.yaml prompt="a photo of an icecream" save_path=icecream
+```
+Please check `./configs/text.yaml` for more options.
+Helper scripts:
+```bash
+# run all image samples (*_rgba.png) in ./data
+python scripts/runall.py --dir ./data --gpu 0
+# run all text samples (hardcoded in runall_sd.py)
+python scripts/runall_sd.py --gpu 0
+# export all ./logs/*.obj to mp4 in ./videos
+python scripts/convert_obj_to_video.py --dir ./logs
+```
+### Gradio Demo
+```bash
+python gradio_app.py
+```
+## Acknowledgement
+This work is built on many amazing research works and open-source projects, thanks a lot to all the authors for sharing!
+* [gaussian-splatting](https://github.com/graphdeco-inria/gaussian-splatting) and [diff-gaussian-rasterization](https://github.com/graphdeco-inria/diff-gaussian-rasterization)
+* [threestudio](https://github.com/threestudio-project/threestudio)
+* [nvdiffrast](https://github.com/NVlabs/nvdiffrast)
+* [dearpygui](https://github.com/hoffstadt/DearPyGui)
+## Citation
+```
+@article{tang2023dreamgaussian,
+  title={DreamGaussian: Generative Gaussian Splatting for Efficient 3D Content Creation},
+  author={Tang, Jiaxiang and Ren, Jiawei and Zhou, Hang and Liu, Ziwei and Zeng, Gang},
+  journal={arXiv preprint arXiv:2309.16653},
+  year={2023}
+}
+```

requirements.txt ADDED Viewed

	@@ -0,0 +1,37 @@

+tqdm
+rich
+ninja
+numpy
+pandas
+scipy
+scikit-learn
+matplotlib
+opencv-python
+imageio
+imageio-ffmpeg
+omegaconf
+torch
+einops
+plyfile
+pygltflib
+# for gui
+dearpygui
+# for stable-diffusion
+huggingface_hub
+diffusers >= 0.9.0
+accelerate
+transformers
+# for dmtet and mesh export
+xatlas
+trimesh
+PyMCubes
+pymeshlab
+rembg[gpu,cli]
+# gradio demo
+gradio

scripts/convert_obj_to_video.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import os
+import glob
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--dir', default='logs', type=str, help='Directory where obj files are stored')
+parser.add_argument('--out', default='videos', type=str, help='Directory where videos will be saved')
+args = parser.parse_args()
+out = args.out
+os.makedirs(out, exist_ok=True)
+files = glob.glob(f'{args.dir}/*.obj')
+for f in files:
+    name = os.path.basename(f)
+    # first stage model, ignore
+    if name.endswith('_mesh.obj'):
+        continue
+    print(f'[INFO] process {name}')
+    os.system(f"python -m kiui.render {f} --save_video {os.path.join(out, name.replace('.obj', '.mp4'))} ")

scripts/run.sh ADDED Viewed

	@@ -0,0 +1,5 @@

+export CUDA_VISIBLE_DEVICES=5
+python main.py --config configs/image.yaml input=data/anya_rgba.png save_path=anya
+python main2.py --config configs/image.yaml input=data/anya_rgba.png save_path=anya
+python -m kiui.render logs/anya.obj --save_video videos/anya.mp4 --wogui

scripts/run_sd.sh ADDED Viewed

	@@ -0,0 +1,31 @@

+export CUDA_VISIBLE_DEVICES=6
+# easy samples
+python main.py --config configs/text.yaml prompt="a photo of an icecream" save_path=icecream
+python main2.py --config configs/text.yaml prompt="a photo of an icecream" save_path=icecream
+python main.py --config configs/text.yaml prompt="a ripe strawberry" save_path=strawberry
+python main2.py --config configs/text.yaml prompt="a ripe strawberry" save_path=strawberry
+python main.py --config configs/text.yaml prompt="a blue tulip" save_path=tulip
+python main2.py --config configs/text.yaml prompt="a blue tulip" save_path=tulip
+python main.py --config configs/text.yaml prompt="a golden goblet" save_path=goblet
+python main2.py --config configs/text.yaml prompt="a golden goblet" save_path=goblet
+python main.py --config configs/text.yaml prompt="a photo of a hamburger" save_path=hamburger
+python main2.py --config configs/text.yaml prompt="a photo of a hamburger" save_path=hamburger
+python main.py --config configs/text.yaml prompt="a delicious croissant" save_path=croissant
+python main2.py --config configs/text.yaml prompt="a delicious croissant" save_path=croissant
+# hard samples
+python main.py --config configs/text.yaml prompt="a baby bunny sitting on top of a stack of pancake" save_path=bunny_pancake
+python main2.py --config configs/text.yaml prompt="a baby bunny sitting on top of a stack of pancake" save_path=bunny_pancake
+python main.py --config configs/text.yaml prompt="a typewriter" save_path=typewriter
+python main2.py --config configs/text.yaml prompt="a typewriter" save_path=typewriter
+python main.py --config configs/text.yaml prompt="a pineapple" save_path=pineapple
+python main2.py --config configs/text.yaml prompt="a pineapple" save_path=pineapple
+python main.py --config configs/text.yaml prompt="a model of a house in Tudor style" save_path=tudor_house
+python main2.py --config configs/text.yaml prompt="a model of a house in Tudor style" save_path=tudor_house
+python main.py --config configs/text.yaml prompt="a lionfish" save_path=lionfish
+python main2.py --config configs/text.yaml prompt="a lionfish" save_path=lionfish
+python main.py --config configs/text.yaml prompt="a bunch of yellow rose, highly detailed" save_path=rose
+python main2.py --config configs/text.yaml prompt="a bunch of yellow rose, highly detailed" save_path=rose

scripts/runall.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import os
+import glob
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--dir', default='data', type=str, help='Directory where processed images are stored')
+parser.add_argument('--out', default='logs', type=str, help='Directory where obj files will be saved')
+parser.add_argument('--video-out', default='videos', type=str, help='Directory where videos will be saved')
+parser.add_argument('--gpu', default=0, type=int, help='ID of GPU to use')
+parser.add_argument('--elevation', default=0, type=int, help='Elevation angle of view in degrees')
+parser.add_argument('--config', default='configs', type=str, help='Path to config directory, which contains image.yaml')
+args = parser.parse_args()
+files = glob.glob(f'{args.dir}/*_rgba.png')
+configs_dir = args.config
+# check if image.yaml exists
+if not os.path.exists(os.path.join(configs_dir, 'image.yaml')):
+    raise FileNotFoundError(
+        f'image.yaml not found in {configs_dir} directory. Please check if the directory is correct.'
+    )
+# create output directories if not exists
+out_dir = args.out
+os.makedirs(out_dir, exist_ok=True)
+video_dir = args.video_out
+os.makedirs(video_dir, exist_ok=True)
+for file in files:
+    name = os.path.basename(file).replace("_rgba.png", "")
+    print(f'======== processing {name} ========')
+    # first stage
+    os.system(f'CUDA_VISIBLE_DEVICES={args.gpu} python main.py '
+              f'--config {configs_dir}/image.yaml '
+              f'input={file} '
+              f'save_path={name} elevation={args.elevation}')
+    # second stage
+    os.system(f'CUDA_VISIBLE_DEVICES={args.gpu} python main2.py '
+              f'--config {configs_dir}/image.yaml '
+              f'input={file} '
+              f'save_path={name} elevation={args.elevation}')
+    # export video
+    mesh_path = os.path.join(out_dir, f'{name}.obj')
+    os.system(f'python -m kiui.render {mesh_path} '
+              f'--save_video {video_dir}/{name}.mp4 '
+              f'--wogui '
+              f'--elevation {args.elevation}')

scripts/runall_sd.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import os
+import glob
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--gpu', default=0, type=int)
+args = parser.parse_args()
+prompts = [
+    ('strawberry', 'a ripe strawberry'),
+    ('cactus_pot', 'a small saguaro cactus planted in a clay pot'),
+    ('hamburger', 'a delicious hamburger'),
+    ('icecream', 'an icecream'),
+    ('tulip', 'a blue tulip'),
+    ('pineapple', 'a ripe pineapple'),
+    ('goblet', 'a golden goblet'),
+    # ('squitopus', 'a squirrel-octopus hybrid'),
+    # ('astronaut', 'Michelangelo style statue of an astronaut'),
+    # ('teddy_bear', 'a teddy bear'),
+    # ('corgi_nurse', 'a plush toy of a corgi nurse'),
+    # ('teapot', 'a blue and white porcelain teapot'),
+    # ('skull', "a human skull"),
+    # ('penguin', 'a penguin'),
+    # ('campfire', 'a campfire'),
+    # ('donut', 'a donut with pink icing'),
+    # ('cupcake', 'a birthday cupcake'),
+    # ('pie', 'shepherds pie'),
+    # ('cone', 'a traffic cone'),
+    # ('schoolbus', 'a schoolbus'),
+    # ('avocado_chair', 'a chair that looks like an avocado'),
+    # ('glasses', 'a pair of sunglasses')
+    # ('potion', 'a bottle of green potion'),
+    # ('chalice', 'a delicate chalice'),
+]
+for name, prompt in prompts:
+    print(f'======== processing {name} ========')
+    # first stage
+    os.system(f'CUDA_VISIBLE_DEVICES={args.gpu} python main.py --config configs/text.yaml prompt="{prompt}" save_path={name}')
+    # second stage
+    os.system(f'CUDA_VISIBLE_DEVICES={args.gpu} python main2.py --config configs/text.yaml  prompt="{prompt}" save_path={name}')
+    # export video
+    mesh_path = os.path.join('logs', f'{name}.obj')
+    os.makedirs('videos', exist_ok=True)
+    os.system(f'python -m kiui.render {mesh_path} --save_video videos/{name}.mp4 --wogui')

sh_utils.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#  Copyright 2021 The PlenOctree Authors.
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#  this list of conditions and the following disclaimer.
+#
+#  2. Redistributions in binary form must reproduce the above copyright notice,
+#  this list of conditions and the following disclaimer in the documentation
+#  and/or other materials provided with the distribution.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+import torch
+# Hardcoded constant factors of the spherical-harmonics polynomials evaluated in eval_sh.
+# C0 scales the single coefficient sh[..., 0]; C1 scales the three terms added when deg > 0.
+C0 = 0.28209479177387814
+C1 = 0.4886025119029199
+# Factors of the five terms added when deg > 1 (sh[..., 4] .. sh[..., 8]).
+C2 = [
+    1.0925484305920792,
+    -1.0925484305920792,
+    0.31539156525252005,
+    -1.0925484305920792,
+    0.5462742152960396
+]
+# Factors of the seven terms added when deg > 2 (sh[..., 9] .. sh[..., 15]).
+C3 = [
+    -0.5900435899266435,
+    2.890611442640554,
+    -0.4570457994644658,
+    0.3731763325901154,
+    -0.4570457994644658,
+    1.445305721320277,
+    -0.5900435899266435
+]
+# Factors of the nine terms added when deg > 3 (sh[..., 16] .. sh[..., 24]).
+C4 = [
+    2.5033429417967046,
+    -1.7701307697799304,
+    0.9461746957575601,
+    -0.6690465435572892,
+    0.10578554691520431,
+    -0.6690465435572892,
+    0.47308734787878004,
+    -1.7701307697799304,
+    0.6258357354491761,
+]
+def eval_sh(deg, sh, dirs):
+    """
+    Evaluate spherical harmonics at unit directions
+    using hardcoded SH polynomials.
+    Works with torch/np/jnp.
+    ... Can be 0 or more batch dimensions.
+    Args:
+        deg: int SH deg. Degrees 0-4 are supported (enforced by the assert below)
+        sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
+        dirs: jnp.ndarray unit directions [..., 3]
+    Returns:
+        [..., C]
+    Raises:
+        AssertionError: if deg is outside 0-4 or sh has fewer than
+            (deg + 1) ** 2 coefficients in its last dimension.
+    """
+    assert deg <= 4 and deg >= 0
+    # number of coefficients needed for the requested degree
+    coeff = (deg + 1) ** 2
+    assert sh.shape[-1] >= coeff
+    # degree 0: constant term, independent of the direction
+    result = C0 * sh[..., 0]
+    if deg > 0:
+        # slices (not plain indexing) keep a trailing axis of size 1 on x, y, z
+        x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
+        # degree 1: terms linear in y, z, x
+        result = (result -
+                C1 * y * sh[..., 1] +
+                C1 * z * sh[..., 2] -
+                C1 * x * sh[..., 3])
+        if deg > 1:
+            # pairwise products shared by all higher-degree terms
+            xx, yy, zz = x * x, y * y, z * z
+            xy, yz, xz = x * y, y * z, x * z
+            # degree 2: five quadratic terms
+            result = (result +
+                    C2[0] * xy * sh[..., 4] +
+                    C2[1] * yz * sh[..., 5] +
+                    C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
+                    C2[3] * xz * sh[..., 7] +
+                    C2[4] * (xx - yy) * sh[..., 8])
+            if deg > 2:
+                # degree 3: seven cubic terms
+                result = (result +
+                C3[0] * y * (3 * xx - yy) * sh[..., 9] +
+                C3[1] * xy * z * sh[..., 10] +
+                C3[2] * y * (4 * zz - xx - yy)* sh[..., 11] +
+                C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
+                C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
+                C3[5] * z * (xx - yy) * sh[..., 14] +
+                C3[6] * x * (xx - 3 * yy) * sh[..., 15])
+                if deg > 3:
+                    # degree 4: nine quartic terms
+                    result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] +
+                            C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
+                            C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
+                            C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
+                            C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
+                            C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
+                            C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
+                            C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
+                            C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
+    return result
+def RGB2SH(rgb):
+    return (rgb - 0.5) / C0
+def SH2RGB(sh):
+    return sh * C0 + 0.5

simple-knn/ext.cpp ADDED Viewed

	@@ -0,0 +1,17 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include <torch/extension.h>
+#include "spatial.h"
+// Python bindings for the extension: exposes distCUDA2 (declared in spatial.h)
+// under the module name supplied by the build through TORCH_EXTENSION_NAME.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("distCUDA2", &distCUDA2);
+}

simple-knn/setup.py ADDED Viewed

	@@ -0,0 +1,35 @@

+#
+# Copyright (C) 2023, Inria
+# GRAPHDECO research group, https://team.inria.fr/graphdeco
+# All rights reserved.
+#
+# This software is free for non-commercial, research and evaluation use
+# under the terms of the LICENSE.md file.
+#
+# For inquiries contact  george.drettakis@inria.fr
+#
+# Build script for the simple_knn CUDA extension (compiled module: simple_knn._C).
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+import os
+# Extra flags for the host C++ compiler; only populated on Windows (os.name == 'nt').
+cxx_compiler_flags = []
+if os.name == 'nt':
+    # "/wd4624" is the MSVC syntax for disabling warning C4624.
+    cxx_compiler_flags.append("/wd4624")
+setup(
+    name="simple_knn",
+    ext_modules=[
+        CUDAExtension(
+            name="simple_knn._C",
+            # CUDA sources plus the pybind11 binding file
+            sources=[
+            "spatial.cu",
+            "simple_knn.cu",
+            "ext.cpp"],
+            # no extra nvcc flags; host-compiler flags as collected above
+            extra_compile_args={"nvcc": [], "cxx": cxx_compiler_flags})
+        ],
+    cmdclass={
+        'build_ext': BuildExtension
+    }
+)

simple-knn/simple_knn.cu ADDED Viewed

	@@ -0,0 +1,221 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#define BOX_SIZE 1024
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "simple_knn.h"
+#include <cub/cub.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <vector>
+#include <cuda_runtime_api.h>
+#include <thrust/device_vector.h>
+#include <thrust/sequence.h>
+#define __CUDACC__
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+// Binary functor returning the component-wise minimum of two float3 values.
+// Passed as the reduction operator to cub::DeviceReduce::Reduce in SimpleKNN::knn.
+struct CustomMin
+{
+	__device__ __forceinline__
+		float3 operator()(const float3& a, const float3& b) const {
+		return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) };
+	}
+};
+// Binary functor returning the component-wise maximum of two float3 values.
+// Passed as the reduction operator to cub::DeviceReduce::Reduce in SimpleKNN::knn.
+struct CustomMax
+{
+	__device__ __forceinline__
+		float3 operator()(const float3& a, const float3& b) const {
+		return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) };
+	}
+};
+// Spreads the low bits of x apart so that, after the final mask (0x09249249),
+// only every third bit position can be set. coord2Morton ORs three such values,
+// shifted by 0, 1 and 2, to interleave the x, y and z bits.
+// The caller passes values in [0, 1023], i.e. 10 significant bits.
+__host__ __device__ uint32_t prepMorton(uint32_t x)
+{
+	x = (x | (x << 16)) & 0x030000FF;
+	x = (x | (x << 8)) & 0x0300F00F;
+	x = (x | (x << 4)) & 0x030C30C3;
+	x = (x | (x << 2)) & 0x09249249;
+	return x;
+}
+// Computes the Morton code of one point relative to the bounds [minn, maxx].
+// Each coordinate is normalised by its extent, scaled by (1 << 10) - 1 = 1023,
+// converted to uint32_t (implicit conversion at the prepMorton call) and bit-spread;
+// the three results are interleaved as x | y << 1 | z << 2.
+// NOTE(review): an axis with maxx == minn divides by zero here - confirm callers never produce that.
+__host__ __device__ uint32_t coord2Morton(float3 coord, float3 minn, float3 maxx)
+{
+	uint32_t x = prepMorton(((coord.x - minn.x) / (maxx.x - minn.x)) * ((1 << 10) - 1));
+	uint32_t y = prepMorton(((coord.y - minn.y) / (maxx.y - minn.y)) * ((1 << 10) - 1));
+	uint32_t z = prepMorton(((coord.z - minn.z) / (maxx.z - minn.z)) * ((1 << 10) - 1));
+	return x | (y << 1) | (z << 2);
+}
+// Kernel: one thread per point. Writes the Morton code of points[idx] to codes[idx]
+// for the first P threads of the grid; surplus threads exit immediately.
+__global__ void coord2Morton(int P, const float3* points, float3 minn, float3 maxx, uint32_t* codes)
+{
+	// rank of this thread within the whole grid
+	auto idx = cg::this_grid().thread_rank();
+	if (idx >= P)
+		return;
+	codes[idx] = coord2Morton(points[idx], minn, maxx);
+}
+// Axis-aligned bounding box stored as its minimum and maximum corner.
+struct MinMax
+{
+	float3 minn; // component-wise minimum corner
+	float3 maxx; // component-wise maximum corner
+};
+// Kernel: computes one bounding box per thread block.
+// Thread idx reads the point points[indices[idx]]; each block reduces the points of its
+// threads to a single MinMax and thread 0 writes it to boxes[blockIdx.x].
+// The reduction starts at BOX_SIZE / 2, so it expects BOX_SIZE threads per block
+// (SimpleKNN::knn launches it that way).
+__global__ void boxMinMax(uint32_t P, float3* points, uint32_t* indices, MinMax* boxes)
+{
+	auto idx = cg::this_grid().thread_rank();
+	MinMax me;
+	if (idx < P)
+	{
+		// a single point is its own bounding box
+		me.minn = points[indices[idx]];
+		me.maxx = points[indices[idx]];
+	}
+	else
+	{
+		// threads past the end contribute the identity of min/max
+		me.minn = { FLT_MAX, FLT_MAX, FLT_MAX };
+		me.maxx = { -FLT_MAX,-FLT_MAX,-FLT_MAX };
+	}
+	__shared__ MinMax redResult[BOX_SIZE];
+	// Tree reduction: each round halves the number of active threads.
+	for (int off = BOX_SIZE / 2; off >= 1; off /= 2)
+	{
+		// publish the current partial results of the threads still taking part
+		if (threadIdx.x < 2 * off)
+			redResult[threadIdx.x] = me;
+		__syncthreads();
+		if (threadIdx.x < off)
+		{
+			// merge with the partner 'off' slots away
+			MinMax other = redResult[threadIdx.x + off];
+			me.minn.x = min(me.minn.x, other.minn.x);
+			me.minn.y = min(me.minn.y, other.minn.y);
+			me.minn.z = min(me.minn.z, other.minn.z);
+			me.maxx.x = max(me.maxx.x, other.maxx.x);
+			me.maxx.y = max(me.maxx.y, other.maxx.y);
+			me.maxx.z = max(me.maxx.z, other.maxx.z);
+		}
+		// all reads must finish before the next round overwrites redResult
+		__syncthreads();
+	}
+	// thread 0 now holds the box of the whole block
+	if (threadIdx.x == 0)
+		boxes[blockIdx.x] = me;
+}
+// Returns the squared distance from point p to the box.
+// On each axis the contribution is 0 when p lies within [minn, maxx] on that axis,
+// otherwise the distance to the nearer of the two bounds.
+__device__ __host__ float distBoxPoint(const MinMax& box, const float3& p)
+{
+	float3 diff = { 0, 0, 0 };
+	if (p.x < box.minn.x || p.x > box.maxx.x)
+		diff.x = min(abs(p.x - box.minn.x), abs(p.x - box.maxx.x));
+	if (p.y < box.minn.y || p.y > box.maxx.y)
+		diff.y = min(abs(p.y - box.minn.y), abs(p.y - box.maxx.y));
+	if (p.z < box.minn.z || p.z > box.maxx.z)
+		diff.z = min(abs(p.z - box.minn.z), abs(p.z - box.maxx.z));
+	return diff.x * diff.x + diff.y * diff.y + diff.z * diff.z;
+}
+// Inserts the squared distance between ref and point into knn, an array of the K
+// smallest squared distances seen so far.
+// Whenever an entry is larger than the value being carried, the two are swapped and the
+// displaced value is carried on; the value left over after the last slot is dropped.
+// Starting from an array filled with FLT_MAX (as the callers do), this keeps knn in ascending order.
+template<int K>
+__device__ void updateKBest(const float3& ref, const float3& point, float* knn)
+{
+	float3 d = { point.x - ref.x, point.y - ref.y, point.z - ref.z };
+	float dist = d.x * d.x + d.y * d.y + d.z * d.z;
+	for (int j = 0; j < K; j++)
+	{
+		if (knn[j] > dist)
+		{
+			// swap: store the smaller value, keep carrying the larger one
+			float t = knn[j];
+			knn[j] = dist;
+			dist = t;
+		}
+	}
+}
+// Kernel: one thread per point (in the order given by indices). Writes to
+// dists[indices[idx]] the mean of the three smallest squared distances from that point
+// to any other point.
+// NOTE(review): with fewer than four points not all three slots get filled and the
+// remaining FLT_MAX entries enter the mean - confirm callers always pass P >= 4.
+__global__ void boxMeanDist(uint32_t P, float3* points, uint32_t* indices, MinMax* boxes, float* dists)
+{
+	int idx = cg::this_grid().thread_rank();
+	if (idx >= P)
+		return;
+	float3 point = points[indices[idx]];
+	float best[3] = { FLT_MAX, FLT_MAX, FLT_MAX };
+	// Pass 1: look only at up to three neighbours on either side in the index order
+	// to obtain a cheap upper bound on the third-smallest distance.
+	for (int i = max(0, idx - 3); i <= min(P - 1, idx + 3); i++)
+	{
+		if (i == idx)
+			continue;
+		updateKBest<3>(point, points[indices[i]], best);
+	}
+	// any box farther away than this bound cannot contain one of the three nearest points
+	float reject = best[2];
+	// Pass 2 starts from scratch so that points seen in pass 1 are not counted twice.
+	best[0] = FLT_MAX;
+	best[1] = FLT_MAX;
+	best[2] = FLT_MAX;
+	for (int b = 0; b < (P + BOX_SIZE - 1) / BOX_SIZE; b++)
+	{
+		MinMax box = boxes[b];
+		float dist = distBoxPoint(box, point);
+		// skip boxes that cannot improve on the bound or on the current third-best
+		if (dist > reject || dist > best[2])
+			continue;
+		// exhaustively scan the points belonging to box b
+		for (int i = b * BOX_SIZE; i < min(P, (b + 1) * BOX_SIZE); i++)
+		{
+			if (i == idx)
+				continue;
+			updateKBest<3>(point, points[indices[i]], best);
+		}
+	}
+	// results are stored at the point's original position, not its sorted position
+	dists[indices[idx]] = (best[0] + best[1] + best[2]) / 3.0f;
+}
+// Host entry point. For each of the P points writes to meanDists the mean squared
+// distance to its three nearest neighbours (see boxMeanDist).
+// points and meanDists are handed directly to CUB and to the kernels below, so they are
+// used as device pointers. No CUDA return code is checked in this function.
+// Steps: bounds of all points -> Morton code per point -> sort point indices by code ->
+// one bounding box per BOX_SIZE consecutive sorted points -> per-point search.
+void SimpleKNN::knn(int P, float3* points, float* meanDists)
+{
+	// device scratch holding the single float3 result of each reduction
+	float3* result;
+	cudaMalloc(&result, sizeof(float3));
+	size_t temp_storage_bytes;
+	// NOTE(review): init = (0,0,0) is passed as the initial value of both reductions, so the
+	// computed bounds would always include the origin - confirm against the CUB documentation.
+	float3 init = { 0, 0, 0 }, minn, maxx;
+	// first call with a null buffer only queries the required temporary storage size
+	cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, points, result, P, CustomMin(), init);
+	thrust::device_vector<char> temp_storage(temp_storage_bytes);
+	cub::DeviceReduce::Reduce(temp_storage.data().get(), temp_storage_bytes, points, result, P, CustomMin(), init);
+	cudaMemcpy(&minn, result, sizeof(float3), cudaMemcpyDeviceToHost);
+	// the max reduction reuses the temporary storage sized for the min reduction
+	cub::DeviceReduce::Reduce(temp_storage.data().get(), temp_storage_bytes, points, result, P, CustomMax(), init);
+	cudaMemcpy(&maxx, result, sizeof(float3), cudaMemcpyDeviceToHost);
+	thrust::device_vector<uint32_t> morton(P);
+	thrust::device_vector<uint32_t> morton_sorted(P);
+	// one Morton code per point, 256 threads per block
+	coord2Morton << <(P + 255) / 256, 256 >> > (P, points, minn, maxx, morton.data().get());
+	// indices = 0, 1, ..., P-1; sorted alongside the codes to recover the permutation
+	thrust::device_vector<uint32_t> indices(P);
+	thrust::sequence(indices.begin(), indices.end());
+	thrust::device_vector<uint32_t> indices_sorted(P);
+	// size query, then the actual key/value sort (keys: codes, values: point indices)
+	cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, morton.data().get(), morton_sorted.data().get(), indices.data().get(), indices_sorted.data().get(), P);
+	temp_storage.resize(temp_storage_bytes);
+	cub::DeviceRadixSort::SortPairs(temp_storage.data().get(), temp_storage_bytes, morton.data().get(), morton_sorted.data().get(), indices.data().get(), indices_sorted.data().get(), P);
+	// one box (and one thread block) per BOX_SIZE consecutive sorted points
+	uint32_t num_boxes = (P + BOX_SIZE - 1) / BOX_SIZE;
+	thrust::device_vector<MinMax> boxes(num_boxes);
+	boxMinMax << <num_boxes, BOX_SIZE >> > (P, points, indices_sorted.data().get(), boxes.data().get());
+	boxMeanDist << <num_boxes, BOX_SIZE >> > (P, points, indices_sorted.data().get(), boxes.data().get(), meanDists);
+	cudaFree(result);
+}

simple-knn/simple_knn.h ADDED Viewed

	@@ -0,0 +1,21 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef SIMPLEKNN_H_INCLUDED
+#define SIMPLEKNN_H_INCLUDED
+// Namespace-like class exposing the nearest-neighbour routine implemented in simple_knn.cu.
+class SimpleKNN
+{
+public:
+	// For each of the P points in 'points', writes one float to 'meanDists'.
+	// The implementation passes both pointers to CUDA kernels.
+	static void knn(int P, float3* points, float* meanDists);
+};
+#endif

simple-knn/simple_knn/.gitkeep ADDED Viewed

File without changes

simple-knn/spatial.cu ADDED Viewed

	@@ -0,0 +1,26 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "spatial.h"
+#include "simple_knn.h"
+// Returns a float32 tensor of length P = points.size(0), created with the options
+// (device, layout) of 'points', and filled by SimpleKNN::knn with one value per point.
+// 'points' is read through a raw pointer as P packed float3 values, so it must be a
+// float32 CUDA tensor; a non-CUDA tensor is rejected with an error instead of handing
+// a host pointer to the kernels.
+// NOTE(review): the float3 cast presumably expects shape [P, 3] - confirm against callers.
+torch::Tensor
+distCUDA2(const torch::Tensor& points)
+{
+  TORCH_CHECK(points.is_cuda(), "distCUDA2: points must be a CUDA tensor");
+  const int P = points.size(0);
+  auto float_opts = points.options().dtype(torch::kFloat32);
+  torch::Tensor means = torch::full({P}, 0.0, float_opts);
+  // Nothing to compute for an empty point set; this also avoids launching
+  // kernels with a zero-sized grid inside SimpleKNN::knn.
+  if (P == 0)
+    return means;
+  // Keep the contiguous tensor in a named variable so the storage behind the raw
+  // pointer clearly outlives the call.
+  torch::Tensor points_contig = points.contiguous();
+  // data_ptr<T>() replaces the deprecated data<T>() accessor.
+  SimpleKNN::knn(P, (float3*)points_contig.data_ptr<float>(), means.data_ptr<float>());
+  return means;
+}

simple-knn/spatial.h ADDED Viewed

	@@ -0,0 +1,14 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include <torch/extension.h>
+// Implemented in spatial.cu; returns one float32 value per row of 'points'.
+torch::Tensor distCUDA2(const torch::Tensor& points);

zero123.py ADDED Viewed

	@@ -0,0 +1,666 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import math
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Union
+import PIL
+import torch
+import torchvision.transforms.functional as TF
+from diffusers.configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import deprecate, is_accelerate_available, logging
+from diffusers.utils.torch_utils import randn_tensor
+from packaging import version
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class CLIPCameraProjection(ModelMixin, ConfigMixin):
+    """
+    A linear projection layer for a concatenated (CLIP embedding, camera embedding) vector.
+    Parameters:
+        embedding_dim (`int`, *optional*, defaults to 768): The dimension of the CLIP embedding; also the
+            output dimension of the projection.
+        additional_embeddings (`int`, *optional*, defaults to 4): The number of extra features concatenated to
+            the CLIP embedding along the last dimension. The input dimension of the projection is
+            `embedding_dim + additional_embeddings`.
+    """
+    @register_to_config
+    def __init__(self, embedding_dim: int = 768, additional_embeddings: int = 4):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.additional_embeddings = additional_embeddings
+        # the layer consumes the concatenation and projects it back to the CLIP width
+        self.input_dim = self.embedding_dim + self.additional_embeddings
+        self.output_dim = self.embedding_dim
+        self.proj = torch.nn.Linear(self.input_dim, self.output_dim)
+    def forward(
+        self,
+        embedding: torch.FloatTensor,
+    ):
+        """
+        Project the concatenated embedding back to `output_dim` features.
+        Args:
+            embedding (`torch.FloatTensor` with a last dimension of size `input_dim`):
+                The concatenated input embeddings.
+        Returns:
+            The output embedding projection (`torch.FloatTensor` with a last dimension of size `output_dim`).
+        """
+        proj_embedding = self.proj(embedding)
+        return proj_embedding
+class Zero123Pipeline(DiffusionPipeline):
+    r"""
+    Pipeline to generate variations from an input image using Stable Diffusion.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPImageProcessor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    # TODO: feature_extractor is required to encode images (if they are in PIL format),
+    # we should give a descriptive message if the pipeline doesn't have one.
+    _optional_components = ["safety_checker"]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        image_encoder: CLIPVisionModelWithProjection,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        clip_camera_projection: CLIPCameraProjection,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+        if safety_checker is None and requires_safety_checker:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+        is_unet_version_less_0_9_0 = hasattr(
+            unet.config, "_diffusers_version"
+        ) and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse(
+            "0.9.0.dev0"
+        )
+        is_unet_sample_size_less_64 = (
+            hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        )
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate(
+                "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
+            )
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            clip_camera_projection=clip_camera_projection,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads the unet, image_encoder, vae and safety checker (when present) with accelerate's
+        `cpu_offload`, using `cuda:{gpu_id}` as the execution device.
+        Args:
+            gpu_id (`int`, defaults to 0): Index of the CUDA device the models are executed on.
+        Raises:
+            ImportError: If accelerate is not installed.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+        device = torch.device(f"cuda:{gpu_id}")
+        for cpu_offloaded_model in [
+            self.unet,
+            self.image_encoder,
+            self.vae,
+            self.safety_checker,
+        ]:
+            # safety_checker is optional and may be None
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        # no hook on the unet: fall back to the pipeline's own device
+        if not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        # otherwise use the first sub-module hook that reports an execution device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+    def _encode_image(
+        self,
+        image,
+        elevation,
+        azimuth,
+        distance,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        clip_image_embeddings=None,
+        image_camera_embeddings=None,
+    ):
+        dtype = next(self.image_encoder.parameters()).dtype
+        if image_camera_embeddings is None:
+            if image is None:
+                assert clip_image_embeddings is not None
+                image_embeddings = clip_image_embeddings.to(device=device, dtype=dtype)
+            else:
+                if not isinstance(image, torch.Tensor):
+                    image = self.feature_extractor(
+                        images=image, return_tensors="pt"
+                    ).pixel_values
+                image = image.to(device=device, dtype=dtype)
+                image_embeddings = self.image_encoder(image).image_embeds
+                image_embeddings = image_embeddings.unsqueeze(1)
+            bs_embed, seq_len, _ = image_embeddings.shape
+            if isinstance(elevation, float):
+                elevation = torch.as_tensor(
+                    [elevation] * bs_embed, dtype=dtype, device=device
+                )
+            if isinstance(azimuth, float):
+                azimuth = torch.as_tensor(
+                    [azimuth] * bs_embed, dtype=dtype, device=device
+                )
+            if isinstance(distance, float):
+                distance = torch.as_tensor(
+                    [distance] * bs_embed, dtype=dtype, device=device
+                )
+            camera_embeddings = torch.stack(
+                [
+                    torch.deg2rad(elevation),
+                    torch.sin(torch.deg2rad(azimuth)),
+                    torch.cos(torch.deg2rad(azimuth)),
+                    distance,
+                ],
+                dim=-1,
+            )[:, None, :]
+            image_embeddings = torch.cat([image_embeddings, camera_embeddings], dim=-1)
+            # project (image, camera) embeddings to the same dimension as clip embeddings
+            image_embeddings = self.clip_camera_projection(image_embeddings)
+        else:
+            image_embeddings = image_camera_embeddings.to(device=device, dtype=dtype)
+            bs_embed, seq_len, _ = image_embeddings.shape
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(
+            bs_embed * num_images_per_prompt, seq_len, -1
+        )
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(image_embeddings)
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
+        return image_embeddings
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(
+                    image, output_type="pil"
+                )
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(
+                feature_extractor_input, return_tensors="pt"
+            ).to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        warnings.warn(
+            "The decode_latents method is deprecated and will be removed in a future version. Please"
+            " use VaeImageProcessor instead",
+            FutureWarning,
+        )
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def check_inputs(self, image, height, width, callback_steps):
+        # TODO: check image size or adjust image size to (height, width)
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+        if (callback_steps is None) or (
+            callback_steps is not None
+            and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            latents = randn_tensor(
+                shape, generator=generator, device=device, dtype=dtype
+            )
+        else:
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    def _get_latent_model_input(
+        self,
+        latents: torch.FloatTensor,
+        image: Optional[
+            Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]
+        ],
+        num_images_per_prompt: int,
+        do_classifier_free_guidance: bool,
+        image_latents: Optional[torch.FloatTensor] = None,
+    ):
+        if isinstance(image, PIL.Image.Image):
+            image_pt = TF.to_tensor(image).unsqueeze(0).to(latents)
+        elif isinstance(image, list):
+            image_pt = torch.stack([TF.to_tensor(img) for img in image], dim=0).to(
+                latents
+            )
+        elif isinstance(image, torch.Tensor):
+            image_pt = image
+        else:
+            image_pt = None
+        if image_pt is None:
+            assert image_latents is not None
+            image_pt = image_latents.repeat_interleave(num_images_per_prompt, dim=0)
+        else:
+            image_pt = image_pt * 2.0 - 1.0  # scale to [-1, 1]
+            # FIXME: encoded latents should be multiplied with self.vae.config.scaling_factor
+            # but zero123 was not trained this way
+            image_pt = self.vae.encode(image_pt).latent_dist.mode()
+            image_pt = image_pt.repeat_interleave(num_images_per_prompt, dim=0)
+        if do_classifier_free_guidance:
+            latent_model_input = torch.cat(
+                [
+                    torch.cat([latents, latents], dim=0),
+                    torch.cat([torch.zeros_like(image_pt), image_pt], dim=0),
+                ],
+                dim=1,
+            )
+        else:
+            latent_model_input = torch.cat([latents, image_pt], dim=1)
+        return latent_model_input
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Optional[
+            Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]
+        ] = None,
+        elevation: Optional[Union[float, torch.FloatTensor]] = None,
+        azimuth: Optional[Union[float, torch.FloatTensor]] = None,
+        distance: Optional[Union[float, torch.FloatTensor]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 3.0,
+        num_images_per_prompt: int = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        clip_image_embeddings: Optional[torch.FloatTensor] = None,
+        image_camera_embeddings: Optional[torch.FloatTensor] = None,
+        image_latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The image or images to guide the image generation. If you provide a tensor, it needs to comply with the
+                configuration of
+                [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
+                `CLIPImageProcessor`
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # 1. Check inputs. Raise error if not correct
+        # TODO: check input elevation, azimuth, and distance
+        # TODO: check image, clip_image_embeddings, image_latents
+        self.check_inputs(image, height, width, callback_steps)
+        # 2. Define call parameters
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, list):
+            batch_size = len(image)
+        elif isinstance(image, torch.Tensor):
+            batch_size = image.shape[0]
+        else:
+            assert image_latents is not None
+            assert (
+                clip_image_embeddings is not None or image_camera_embeddings is not None
+            )
+            batch_size = image_latents.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 3. Encode input image
+        if isinstance(image, PIL.Image.Image) or isinstance(image, list):
+            pil_image = image
+        elif isinstance(image, torch.Tensor):
+            pil_image = [TF.to_pil_image(image[i]) for i in range(image.shape[0])]
+        else:
+            pil_image = None
+        image_embeddings = self._encode_image(
+            pil_image,
+            elevation,
+            azimuth,
+            distance,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            clip_image_embeddings,
+            image_camera_embeddings,
+        )
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 5. Prepare latent variables
+        # num_channels_latents = self.unet.config.in_channels
+        num_channels_latents = 4  # FIXME: hard-coded
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = self._get_latent_model_input(
+                    latents,
+                    image,
+                    num_images_per_prompt,
+                    do_classifier_free_guidance,
+                    image_latents,
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=image_embeddings,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                ).sample
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs
+                ).prev_sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        if not output_type == "latent":
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False
+            )[0]
+            image, has_nsfw_concept = self.run_safety_checker(
+                image, device, image_embeddings.dtype
+            )
+        else:
+            image = latents
+            has_nsfw_concept = None
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=do_denormalize
+        )
+        if not return_dict:
+            return (image, has_nsfw_concept)
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )