Spaces: Running on Zero
Stanislaw Szymanowicz committed
Commit e10da38 • 1 Parent(s): 4aa5114
Add model and app file
Files changed:
- .gitignore (+1, -0)
- app.py (+178, -4)
- model_file/objaverse/.hydra/config.yaml (+66, -0)
.gitignore
ADDED
@@ -0,0 +1 @@
+*/__pycache__
app.py
CHANGED
@@ -1,7 +1,181 @@
+import torch
+import torchvision
+import numpy as np
+
+import os
+from omegaconf import OmegaConf
+from PIL import Image
+
+from utils.app_utils import (
+    remove_background,
+    resize_foreground,
+    set_white_background,
+    resize_to_128,
+    to_tensor,
+    get_source_camera_v2w_rmo_and_quats,
+    get_target_cameras,
+    export_to_obj)
+
+import imageio
+
+from scene.gaussian_predictor import GaussianSplatPredictor
+from gaussian_renderer import render_predicted
+
 import gradio as gr
 
-def greet(name):
-    return "Hello " + name + "!!"
+import rembg
+
+def main():
+
+    # ============= model loading ==========
+    def load_model(device):
+        experiment_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                       "model_file", "objaverse")
+        # load cfg
+        training_cfg = OmegaConf.load(os.path.join(experiment_path, ".hydra", "config.yaml"))
+        # load model
+        model = GaussianSplatPredictor(training_cfg)
+        ckpt_loaded = torch.load(os.path.join(experiment_path, "model_latest.pth"), map_location=device)
+        model.load_state_dict(ckpt_loaded["model_state_dict"])
+        return model, training_cfg
+
+    if torch.cuda.is_available():
+        device = "cuda:0"
+        torch.cuda.set_device(device)  # only valid when CUDA is available, so guarded here
+    else:
+        device = "cpu"
+
+    model, model_cfg = load_model(device)
+    model.to(device)
+
+    # ============= image preprocessing =============
+    rembg_session = rembg.new_session()
+
+    def check_input_image(input_image):
+        if input_image is None:
+            raise gr.Error("No image uploaded!")
+
+    def preprocess(input_image, preprocess_background=True, foreground_ratio=0.65):
+        # around 0.65-0.7 seems to be a reasonable foreground ratio
+        if preprocess_background:
+            image = input_image.convert("RGB")
+            image = remove_background(image, rembg_session)
+            image = resize_foreground(image, foreground_ratio)
+            image = set_white_background(image)
+        else:
+            image = input_image
+            if image.mode == "RGBA":
+                image = set_white_background(image)
+        image = resize_to_128(image)
+        return image
+
+    ply_out_path = "/users/stan/splatter-image/gradio_out/mesh.ply"
+    os.makedirs(os.path.dirname(ply_out_path), exist_ok=True)
+
+    def reconstruct_and_export(image):
+        """
+        Passes the image through the model and outputs the reconstruction in the form of a dict of tensors.
+        """
+        image = to_tensor(image).to(device)
+        view_to_world_source, rot_transform_quats = get_source_camera_v2w_rmo_and_quats()
+        view_to_world_source = view_to_world_source.to(device)
+        rot_transform_quats = rot_transform_quats.to(device)
+
+        reconstruction_unactivated = model(
+            image.unsqueeze(0).unsqueeze(0),
+            view_to_world_source,
+            rot_transform_quats,
+            None,
+            activate_output=False)
+
+        reconstruction = {k: v[0].contiguous() for k, v in reconstruction_unactivated.items()}
+        reconstruction["scaling"] = model.scaling_activation(reconstruction["scaling"])
+        reconstruction["opacity"] = model.opacity_activation(reconstruction["opacity"])
+
+        # render images in a loop
+        world_view_transforms, full_proj_transforms, camera_centers = get_target_cameras()
+        background = torch.tensor([1, 1, 1], dtype=torch.float32, device=device)
+        loop_renders = []
+        t_to_512 = torchvision.transforms.Resize(512, interpolation=torchvision.transforms.InterpolationMode.NEAREST)
+        for r_idx in range(world_view_transforms.shape[0]):
+            image = render_predicted(reconstruction,
+                                     world_view_transforms[r_idx].to(device),
+                                     full_proj_transforms[r_idx].to(device),
+                                     camera_centers[r_idx].to(device),
+                                     background,
+                                     model_cfg,
+                                     focals_pixels=None)["render"]
+            image = t_to_512(image)
+            loop_renders.append(torch.clamp(image * 255, 0.0, 255.0).detach().permute(1, 2, 0).cpu().numpy().astype(np.uint8))
+        loop_out_path = os.path.join(os.path.dirname(ply_out_path), "loop.mp4")
+        imageio.mimsave(loop_out_path, loop_renders, fps=25)
+        # export reconstruction to .ply
+        export_to_obj(reconstruction_unactivated, ply_out_path)
+
+        return loop_out_path, ply_out_path
+
+    with gr.Blocks() as demo:
+        gr.Markdown(
+            """
+
+            # Splatter Image Demo
+            [Splatter Image](https://github.com/szymanowiczs/splatter-image) (CVPR 2024) is a fast and super cheap-to-train method for 3D object reconstruction from a single image.
+            The model used in this demo was trained on **Objaverse-LVIS on 2 A6000 GPUs for 3.5 days**.
+            On an NVIDIA V100 GPU, reconstruction runs at 38 FPS and rendering at 588 FPS.
+            Upload an image of an object to see how Splatter Image does.
+
+            **Comments:**
+            1. The first example you upload should take about 4.5 seconds (with preprocessing, saving and overhead); the following ones take about 1.5 s each.
+            2. The model does not work well on photos of humans.
+            3. The 3D viewer shows a .ply mesh extracted from a mix of 3D Gaussians. Artefacts might show; see the video for more faithful results.
+            4. Best results are achieved on the datasets described in the [repository](https://github.com/szymanowiczs/splatter-image) using that code. This demo is experimental.
+            5. Our model might not be better than some state-of-the-art methods, but it is of comparable quality and is **much** cheaper to train and run.
+            """
+        )
+        with gr.Row(variant="panel"):
+            with gr.Column():
+                with gr.Row():
+                    input_image = gr.Image(
+                        label="Input Image",
+                        image_mode="RGBA",
+                        sources="upload",
+                        type="pil",
+                        elem_id="content_image",
+                    )
+                    processed_image = gr.Image(label="Processed Image", interactive=False)
+                with gr.Row():
+                    with gr.Group():
+                        preprocess_background = gr.Checkbox(
+                            label="Remove Background", value=True
+                        )
+                with gr.Row():
+                    submit = gr.Button("Generate", elem_id="generate", variant="primary")
+            with gr.Column():
+                with gr.Row():
+                    with gr.Tab("Reconstruction"):
+                        with gr.Column():
+                            output_video = gr.Video(value=None, width=512, label="Rendered Video", autoplay=True)
+                            output_model = gr.Model3D(
+                                height=512,
+                                label="Output Model",
+                                interactive=False
+                            )
+
+        submit.click(fn=check_input_image, inputs=[input_image]).success(
+            fn=preprocess,
+            inputs=[input_image, preprocess_background],
+            outputs=[processed_image],
+        ).success(
+            fn=reconstruct_and_export,
+            inputs=[processed_image],
+            outputs=[output_video, output_model],
+        )
+
+    demo.queue(max_size=1)
+    demo.launch()
+
+
+if __name__ == "__main__":
+    main()
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+# gradio app interface
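Note: the reconstruction pipeline added above can also be exercised without the Gradio UI. The following is a minimal headless sketch, not part of this commit: it assumes the repository layout the diff implies (utils/app_utils.py, scene/gaussian_predictor.py, and model_file/objaverse/ containing .hydra/config.yaml and model_latest.pth), and the file name input.png plus the eval/no_grad wrapping are illustrative additions.

# headless_demo.py -- hypothetical driver script, mirroring load_model() and
# reconstruct_and_export() from the app.py added in this commit.
import os

import torch
from omegaconf import OmegaConf
from PIL import Image

from utils.app_utils import (to_tensor, resize_to_128,
                             get_source_camera_v2w_rmo_and_quats)
from scene.gaussian_predictor import GaussianSplatPredictor

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the Hydra config saved at training time, then the checkpoint weights.
experiment_path = os.path.join("model_file", "objaverse")
cfg = OmegaConf.load(os.path.join(experiment_path, ".hydra", "config.yaml"))
model = GaussianSplatPredictor(cfg)
ckpt = torch.load(os.path.join(experiment_path, "model_latest.pth"), map_location=device)
model.load_state_dict(ckpt["model_state_dict"])
model.to(device).eval()

# One 128x128 input image and the canonical source camera, as in app.py.
image = to_tensor(resize_to_128(Image.open("input.png").convert("RGB"))).to(device)
v2w_source, rot_quats = get_source_camera_v2w_rmo_and_quats()

with torch.no_grad():
    # Batch and view dimensions are added: (1, 1, C, H, W).
    reconstruction = model(image.unsqueeze(0).unsqueeze(0),
                           v2w_source.to(device),
                           rot_quats.to(device),
                           None,
                           activate_output=False)
print({k: tuple(v.shape) for k, v in reconstruction.items()})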
model_file/objaverse/.hydra/config.yaml
ADDED
@@ -0,0 +1,66 @@
+wandb:
+  project: gs_pred
+cam_embd:
+  embedding: null
+  encode_embedding: null
+  dimension: 0
+  method: null
+general:
+  device: 0
+  random_seed: 0
+  num_devices: 2
+  mixed_precision: true
+data:
+  training_resolution: 128
+  fov: 49.134342641202636
+  subset: -1
+  input_images: 1
+  znear: 0.8
+  zfar: 3.2
+  category: objaverse
+  white_background: true
+  origin_distances: false
+opt:
+  iterations: 50001
+  base_lr: 6.34584421e-05
+  batch_size: 16
+  betas:
+  - 0.9
+  - 0.999
+  loss: l2
+  imgs_per_obj: 4
+  ema:
+    use: true
+    update_every: 10
+    update_after_step: 100
+    beta: 0.9999
+  lambda_lpips: 0.33814373
+  start_lpips_after: 0
+  step_lr_at: -1
+model:
+  max_sh_degree: 1
+  inverted_x: false
+  inverted_y: true
+  name: SingleUNet
+  opacity_scale: 1.0
+  opacity_bias: -2.0
+  scale_scale: 0.01
+  scale_bias: 0.02
+  xyz_scale: 0.1
+  xyz_bias: 0.0
+  depth_scale: 1.0
+  depth_bias: 0.0
+  network_without_offset: false
+  network_with_offset: true
+  attention_resolutions:
+  - 16
+  cross_view_attention: true
+  isotropic: false
+  base_dim: 128
+  num_blocks: 4
+logging:
+  ckpt_iterations: 1000
+  val_log: 10000
+  loss_log: 10
+  loop_log: 10000
+  render_log: 10000
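Note: app.py consumes this file directly via OmegaConf.load, reading the nested keys through attribute access. A quick illustrative snippet (not part of the commit; the values in the comments are taken from the YAML above):

# Hypothetical inspection snippet for the Hydra-dumped config.
from omegaconf import OmegaConf

cfg = OmegaConf.load("model_file/objaverse/.hydra/config.yaml")

# Nested keys are available via dotted attribute access.
print(cfg.model.name)                # SingleUNet
print(cfg.data.training_resolution)  # 128
print(cfg.opt.base_lr)               # 6.34584421e-05
assert cfg.data.input_images == 1    # the demo reconstructs from a single view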