hpwang committed on
Commit
fd5e0f7
1 Parent(s): 04a2f88
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +106 -0
  2. ops/__init__.py +0 -0
  3. ops/connect.py +113 -0
  4. ops/depth_pro.py +13 -0
  5. ops/eval.py +51 -0
  6. ops/fooocus.py +6 -0
  7. ops/gs/__init__.py +0 -0
  8. ops/gs/basic.py +296 -0
  9. ops/gs/sh_utils.py +96 -0
  10. ops/gs/train.py +92 -0
  11. ops/llava.py +31 -0
  12. ops/mcs.py +121 -0
  13. ops/sky.py +22 -0
  14. ops/trajs/__init__.py +53 -0
  15. ops/trajs/spiral.py +46 -0
  16. ops/utils.py +381 -0
  17. ops/visual_check.py +64 -0
  18. pipe/__init__.py +0 -0
  19. pipe/c2f_recons.py +211 -0
  20. pipe/cfgs/INSTRUCT.md +18 -0
  21. pipe/cfgs/__init__.py +8 -0
  22. pipe/cfgs/basic.yaml +47 -0
  23. pipe/lvm_inpaint.py +85 -0
  24. pipe/reconstruct.py +52 -0
  25. pipe/refine_mvdps.py +155 -0
  26. requirements.txt +28 -0
  27. tools/DepthPro/ACKNOWLEDGEMENTS.md +418 -0
  28. tools/DepthPro/CODE_OF_CONDUCT.md +71 -0
  29. tools/DepthPro/CONTRIBUTING.md +11 -0
  30. tools/DepthPro/LICENSE +47 -0
  31. tools/DepthPro/README.md +97 -0
  32. tools/DepthPro/command_pro_dpt.py +54 -0
  33. tools/DepthPro/get_pretrained_models.sh +8 -0
  34. tools/DepthPro/pyproject.toml +59 -0
  35. tools/DepthPro/src/depth_pro/__init__.py +5 -0
  36. tools/DepthPro/src/depth_pro/cli/__init__.py +4 -0
  37. tools/DepthPro/src/depth_pro/cli/run.py +154 -0
  38. tools/DepthPro/src/depth_pro/depth_pro.py +298 -0
  39. tools/DepthPro/src/depth_pro/eval/boundary_metrics.py +332 -0
  40. tools/DepthPro/src/depth_pro/eval/dis5k_sample_list.txt +200 -0
  41. tools/DepthPro/src/depth_pro/network/__init__.py +2 -0
  42. tools/DepthPro/src/depth_pro/network/__pycache__/__init__.cpython-310.pyc +0 -0
  43. tools/DepthPro/src/depth_pro/network/__pycache__/decoder.cpython-310.pyc +0 -0
  44. tools/DepthPro/src/depth_pro/network/__pycache__/encoder.cpython-310.pyc +0 -0
  45. tools/DepthPro/src/depth_pro/network/__pycache__/fov.cpython-310.pyc +0 -0
  46. tools/DepthPro/src/depth_pro/network/__pycache__/vit.cpython-310.pyc +0 -0
  47. tools/DepthPro/src/depth_pro/network/__pycache__/vit_factory.cpython-310.pyc +0 -0
  48. tools/DepthPro/src/depth_pro/network/decoder.py +206 -0
  49. tools/DepthPro/src/depth_pro/network/encoder.py +332 -0
  50. tools/DepthPro/src/depth_pro/network/fov.py +82 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from PIL import Image
5
+ from pipe.cfgs import load_cfg
6
+ from pipe.c2f_recons import Pipeline
7
+ from ops.gs.basic import Gaussian_Scene
8
+ from datetime import datetime
9
+
10
# Module-level state used by the callbacks defined below: the parsed base
# config and one pipeline instance built from it.
cfg = load_cfg('pipe/cfgs/basic.yaml')
vistadream = Pipeline(cfg)

from ops.visual_check import Check
# NOTE(review): the callbacks below go through `vistadream.checkor`; this
# module-level instance is not referenced in the visible part of the file.
checkor = Check()
15
+
16
def get_temp_path():
    """Create and return a fresh, timestamped output directory for one run.

    Returns:
        The directory as a string of the form
        ``data/gradio_temp/<YYYYmmdd_HHMMSS>/`` (trailing slash included, so
        callers can append a file name directly). The directory exists when
        this function returns.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"data/gradio_temp/{timestamp}/"
    # Create the per-run directory itself, not only its parent: callers write
    # files such as 'scene.pth' straight into it.
    os.makedirs(output_path, exist_ok=True)
    return output_path
21
+
22
def scene_generate(rgb,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps):
    """Reconstruct a scene from one image and save it into a new temp folder.

    Configures the module-level ``vistadream`` pipeline with the given UI
    values, runs its coarse stage followed by its MCS refinement stage, and
    serialises ``vistadream.scene`` with ``torch.save``.

    Args:
        rgb: input image, forwarded unchanged to ``vistadream._coarse_scene``.
        num_coarse_views: stored as ``vistadream.n_sample``.
        num_mcs_views: stored as ``vistadream.mcs_n_view``.
        mcs_rect_w: stored as ``vistadream.mcs_rect_w``.
        mcs_steps: stored as ``vistadream.mcs_iterations``.

    Returns:
        The directory prefix returned by ``get_temp_path()``; ``scene.pth`` is
        written under it.
    """
    pipeline = vistadream
    # every run starts from an empty scene
    pipeline.scene = Gaussian_Scene(cfg)
    # trajectory generation
    pipeline.traj_type = 'spiral'
    pipeline.scene.traj_type = 'spiral'
    # remaining knobs, applied in a fixed order
    settings = (
        ('n_sample', num_coarse_views),
        # scene generation
        ('opt_iters_per_frame', 512),
        ('outpaint_extend_times', 0.45),
        ('outpaint_selections', ['Left','Right','Top','Bottom']),
        # scene refinement
        ('mcs_n_view', num_mcs_views),
        ('mcs_rect_w', mcs_rect_w),
        ('mcs_iterations', mcs_steps),
    )
    for attr_name, attr_value in settings:
        setattr(pipeline, attr_name, attr_value)
    # stage 1: coarse scene
    pipeline._coarse_scene(rgb)
    torch.cuda.empty_cache()
    # stage 2: refinement
    pipeline._MCS_Refinement()
    output_path = get_temp_path()
    torch.cuda.empty_cache()
    scene_file = output_path+'scene.pth'
    torch.save(pipeline.scene,scene_file)
    return output_path
46
+
47
def render_video(output_path):
    """Render the pipeline's current scene and return the two video paths.

    Calls ``vistadream.checkor._render_video`` with ``output_path + '.'`` as
    ``save_dir`` and returns ``(<output_path>video_rgb.mp4,
    <output_path>video_dpt.mp4)``.
    """
    # presumably `_render_video` writes both files into save_dir; verify
    # against ops/visual_check
    vistadream.checkor._render_video(vistadream.scene,save_dir=output_path+'.')
    rgb_video_fn = output_path+'video_rgb.mp4'
    dpt_video_fn = output_path+'video_dpt.mp4'
    return rgb_video_fn,dpt_video_fn
51
+
52
def process(rgb,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps):
    """Click handler: build the scene, then render it.

    Returns whatever ``render_video`` returns for the directory produced by
    ``scene_generate`` (a pair of video paths).
    """
    scene_dir = scene_generate(rgb,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps)
    videos = render_video(scene_dir)
    return videos
55
+
56
# Gradio front end: one input image plus a few sliders in, two videos out.
# NOTE(review): the nesting of the layout contexts below was reconstructed;
# confirm against the intended page layout.
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("## VistaDream")
        gr.Markdown("### Sampling multiview consistent images for single-view scene reconstruction")
        gr.HTML("""
        <div style="display:flex;column-gap:4px;">
            <a href="https://github.com/WHU-USI3DV/VistaDream">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://vistadream-project-page.github.io/">
                <img src='https://img.shields.io/badge/Project-Page-green'>
            </a>
            <a href="https://arxiv.org/abs/2410.16892">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
        </div>
        """)

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil")
                run_button = gr.Button("Run")
                with gr.Accordion("Advanced options", open=False):
                    num_coarse_views = gr.Slider(label="Coarse-Expand", minimum=5, maximum=25, value=10, step=1)
                    num_mcs_views = gr.Slider(label="MCS Optimization Views", minimum=4, maximum=10, value=8, step=1)
                    mcs_rect_w = gr.Slider(label="MCS Rectification Weight", minimum=0.3, maximum=0.8, value=0.7, step=0.1)
                    mcs_steps = gr.Slider(label="MCS Steps", minimum=8, maximum=15, value=10, step=1)
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        # The caption must go through `label=`: the first
                        # positional argument of gr.Video is the initial
                        # *value* (a video path), not the caption.
                        rgb_video = gr.Video(label="Output RGB renderings")
                    with gr.Column():
                        dpt_video = gr.Video(label="Output DPT renderings")
        examples = gr.Examples(
            examples = [
                ['',
                 '',
                 ''],
                ['',
                 '',
                 ''],
                ['',
                 '',
                 '']
            ],
            inputs=[input_image,rgb_video,dpt_video]
        )
    ips = [input_image,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps]
    run_button.click(fn=process, inputs=ips, outputs=[rgb_video,dpt_video])

# Listen on all interfaces.
demo.launch(server_name='0.0.0.0')
ops/__init__.py ADDED
File without changes
ops/connect.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from copy import deepcopy
4
+ from ops.utils import dpt2xyz,transform_points
5
+
6
class Connect_Tool():
    """Align an inpainted depth map to a rendered one with a single global
    scale and shift, estimated on the pixels that were NOT inpainted."""

    def __init__(self) -> None:
        pass

    def _align_scale_shift_numpy(self, pred: np.ndarray, target: np.ndarray):
        """Fit ``target ~ scale * pred + shift`` by least squares.

        Only samples with ``target > 0`` and ``pred < 199`` take part.

        Args:
            pred: 1-D array of predicted depths.
            target: 1-D array of reference depths, same length as ``pred``.

        Returns:
            ``(scale, shift)``. With 10 or fewer usable samples the identity
            ``(1, 0)`` is returned. If the fitted scale is negative, the
            ratio of medians is used as scale and the shift is dropped.
        """
        usable = (target > 0) & (pred < 199)
        if np.sum(usable) <= 10:
            # too few samples for a meaningful fit
            return 1, 0
        pred_valid = pred[usable]
        target_valid = target[usable]
        scale, shift = np.polyfit(pred_valid, target_valid, deg=1)
        if scale < 0:
            # a negative scale would flip the depth ordering: fall back to a
            # pure median-ratio rescale
            scale = np.median(target_valid) / (np.median(pred_valid) + 1e-8)
            shift = 0
        return scale, shift

    def __call__(self, render_dpt, inpaint_dpt, inpaint_msk):
        """Return ``inpaint_dpt`` rescaled to agree with ``render_dpt``.

        Args:
            render_dpt: reference depth map.
            inpaint_dpt: depth map to be aligned, same shape.
            inpaint_msk: mask of inpainted pixels, same shape; values above
                0.5 count as inpainted (bool, integer or float masks work).

        Returns:
            ``inpaint_dpt * scale + shift``; ``render_dpt`` itself when the
            mask marks no pixel as inpainted.
        """
        # Binarise once so that `~` below is a logical NOT for every dtype
        # (it raises on float masks and is bitwise on integer masks).
        inpaint_msk = np.asarray(inpaint_msk) > 0.5
        if not inpaint_msk.any():
            return render_dpt
        # the alignment is estimated on the kept (non-inpainted) pixels
        kept = ~inpaint_msk
        scale, shift = self._align_scale_shift_numpy(inpaint_dpt[kept], render_dpt[kept])
        return inpaint_dpt * scale + shift
33
+
34
class Smooth_Connect_Tool():
    """Blend an inpainted depth map into a rendered depth map: an optional
    global scale/shift alignment followed by a smooth per-pixel correction."""

    def __init__(self) -> None:
        self.coarse_align = Connect_Tool()

    def _coarse_alignment(self, render_dpt, ipaint_dpt, ipaint_msk):
        """Scale and shift ``ipaint_dpt`` so it coarsely matches ``render_dpt``."""
        return self.coarse_align(render_dpt,ipaint_dpt,ipaint_msk)

    def _refine_movements(self, render_dpt, ipaint_dpt, ipaint_msk, n_iters=100, kernel=15):
        '''
        Follow https://arxiv.org/pdf/2311.13384

        The per-pixel offset ``render_dpt - ipaint_dpt`` is known on kept
        (non-inpainted) pixels. It is re-imposed there and box-blurred
        ``n_iters`` times so that it spreads into the inpainted region; the
        resulting offset map is then added to ``ipaint_dpt``. The offset is
        NOT re-imposed after the last blur, so kept pixels also receive the
        blurred value.

        Args:
            render_dpt: reference depth map.
            ipaint_dpt: depth map to be adjusted, same shape.
            ipaint_msk: mask of inpainted pixels; values above 0.5 count.
            n_iters: number of impose-and-blur rounds (default 100).
            kernel: side length of the square box filter (default 15).

        Returns:
            The adjusted depth map, same shape as ``ipaint_dpt``.
        '''
        ipaint_msk = ipaint_msk>.5
        kept = ~ipaint_msk
        # offset required on the kept pixels
        keep_adjust_dpt = render_dpt[kept] - ipaint_dpt[kept]
        # diffuse the offset into the inpainted region
        complete_adjust = np.zeros_like(ipaint_dpt)
        for _ in range(n_iters):
            complete_adjust[kept] = keep_adjust_dpt
            complete_adjust = cv2.blur(complete_adjust,(kernel,kernel))
        return ipaint_dpt + complete_adjust

    def _affine_dpt_to_GS(self, render_dpt, inpaint_dpt, inpaint_msk):
        """Global scale/shift alignment followed by the smooth refinement.

        Returns ``render_dpt`` unchanged when the mask marks nothing as inpainted.
        """
        if np.sum(inpaint_msk > 0.5) < 1.: return render_dpt
        inpaint_dpt = self._coarse_alignment(render_dpt,inpaint_dpt,inpaint_msk)
        inpaint_dpt = self._refine_movements(render_dpt,inpaint_dpt,inpaint_msk)
        return inpaint_dpt

    def _scale_dpt_to_GS(self, render_dpt, inpaint_dpt, inpaint_msk):
        """Smooth refinement only (no global alignment step).

        Returns ``render_dpt`` unchanged when the mask marks nothing as inpainted.
        """
        if np.sum(inpaint_msk > 0.5) < 1.: return render_dpt
        inpaint_dpt = self._refine_movements(render_dpt,inpaint_dpt,inpaint_msk)
        return inpaint_dpt
75
+
76
class Occlusion_Removal():
    """Clear inpaint-mask pixels of a new frame whose 3D points would sit in
    front of what the already registered frames observed."""

    def __init__(self) -> None:
        pass

    def __call__(self,scene,frame):
        """Update ``frame.inpaint`` in place and return ``frame``.

        Every newly inpainted pixel is back-projected, moved to world
        coordinates and projected into each frame of ``scene.frames``. If
        that frame's depth at the hit pixel exceeds the point's depth by more
        than 1/15 of the mean of the two depths, the pixel is cleared from
        ``frame.inpaint``.
        """
        # back-project the new frame, keeping only the newly added pixels
        pts_cam = dpt2xyz(frame.dpt,frame.intrinsic)
        pts_cam = pts_cam[frame.inpaint]
        # camera -> world
        cam_to_world = np.linalg.inv(frame.extrinsic)
        pts_world = transform_points(pts_cam,cam_to_world)
        # 1 = keep this point, 0 = drop it
        keep = np.ones_like(pts_world[...,0])
        for seen in scene.frames:
            # world -> camera of the earlier frame (on a copy of the points)
            pts_seen = transform_points(deepcopy(pts_world),seen.extrinsic)
            # pinhole projection: homogeneous pixel coordinates, last entry is depth
            proj = np.einsum('ab,pb->pa',seen.intrinsic,pts_seen)
            pix = proj[...,:2]/proj[...,-1:]
            depth = proj[...,-1]
            # points inside the image bounds and in front of the camera
            col, row = pix[...,0], pix[...,1]
            inside = (col>0) & (col<seen.W) & (row>0) & (row<seen.H) & (depth>1e-2)
            hit_idx = np.where(inside)[0]
            pix = pix[hit_idx].astype(np.uint32)
            depth = depth[hit_idx]
            # depth the earlier frame stores at the pixel that was hit
            seen_depth = seen.dpt[pix[:,1],pix[:,0]]
            # the new point is clearly nearer than what was observed there
            too_near = (seen_depth-depth)>(depth+seen_depth)/2./15.
            keep[hit_idx[too_near]] = 0.
        # write back through explicit (row, col) indices rather than chained
        # boolean indexing, so the assignment reaches frame.inpaint itself
        rows,cols = np.where(frame.inpaint)
        dropped = keep<.5
        frame.inpaint[rows[dropped],cols[dropped]] = False
        return frame
ops/depth_pro.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,sys
2
+ currect = os.getcwd()
3
+ reference = f'{currect}/tools/DepthPro'
4
+ sys.path.append(reference)
5
+
6
+ from command_pro_dpt import apple_pro_depth
7
+
8
class Depth_Pro_Tool(apple_pro_depth):
    """Project-facing subclass of ``apple_pro_depth`` that forwards
    construction and calls to the parent class unchanged."""

    def __init__(self, device='cuda', ckpt='/mnt/proj/SOTAs/ml-depth-pro-main/checkpoints/depth_pro.pt'):
        # NOTE(review): the default checkpoint is a machine-specific absolute
        # path; callers on another machine have to pass `ckpt` explicitly.
        super().__init__(device, ckpt)

    def __call__(self, image, f_px=None):
        """Forward ``image`` and the optional ``f_px`` to the parent call."""
        prediction = super().__call__(image, f_px)
        return prediction
ops/eval.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ from PIL import Image
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ from ops.llava import Llava
6
+
7
+
8
class llava_iqa():
    """Image-quality scoring of a rendered video: sampled frames are put to a
    LLaVA model together with yes/no questions, and the fraction of "yes"
    answers is reported per question."""

    def __init__(self) -> None:
        self._questions()
        self.llava = Llava(device='cuda')

    def _questions(self):
        """Define the yes/no questions, keyed by the aspect they probe."""
        # quality, noise, structure, texture
        self.questions = {'noise-free':'Is the image free of noise or distortion',
                          'sharp':'Does the image show clear objects and sharp edges',
                          'structure':'Is the overall scene coherent and realistic in terms of layout and proportions in this image',
                          'detail':'Does this image show detailed textures and materials',
                          'quality':'Is this image overall a high quality image with clear objects, sharp edges, nice color, good overall structure, and good visual quailty'}

    def _load_renderings(self,video_fn):
        """Decode ``video_fn`` and return up to 50 randomly chosen frames.

        Returns:
            A list of RGB ``PIL.Image`` frames in random order; empty when the
            video yields no frame.
        """
        capturer = cv2.VideoCapture(video_fn)
        frames = []
        try:
            while True:
                ret,frame = capturer.read()
                if not ret or frame is None: break
                # OpenCV decodes to BGR; convert to RGB before wrapping
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame.astype(np.uint8)))
        finally:
            # always give the decoder handle back
            capturer.release()
        # random sample of at most 50 frames
        idxs = np.random.permutation(len(frames))[0:50]
        return [frames[i] for i in idxs]

    @staticmethod
    def _is_yes(answer):
        """Return True when the text after the last 'ASSISTANT:' marker starts
        with "ye" (case-insensitive, leading whitespace ignored)."""
        marker = 'ASSISTANT:'
        pos = answer.rfind(marker)
        # without a marker, judge the whole text instead of slicing at a bogus offset
        reply = answer[pos+len(marker):] if pos >= 0 else answer
        return reply.strip().lower().startswith('ye')

    def __call__(self,video_fn='data/vistadream/bust/video_rgb.mp4'):
        """Score a video.

        Args:
            video_fn: path of the video to evaluate.

        Returns:
            Dict mapping each question key to the mean of its per-frame votes
            (1 for yes, 0 otherwise); ``nan`` when no frame could be read.
        """
        renderings = self._load_renderings(video_fn)
        results = {}
        for key,question in self.questions.items():
            query = f'<image>\n USER: {question}, just anwser with yes or no? \n ASSISTANT: '
            votes = [1 if self._is_yes(self.llava(rendering,query)) else 0
                     for rendering in renderings]
            results[key] = np.mean(np.array(votes)) if votes else float('nan')
        return results
50
+
51
+
ops/fooocus.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import os,sys
2
+ currect = os.getcwd()
3
+ reference = f'{currect}/tools/Fooocus'
4
+ sys.path.insert(0,reference)
5
+
6
+ from fooocus_command import Fooocus
ops/gs/__init__.py ADDED
File without changes
ops/gs/basic.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL
2
+ import torch
3
+ import numpy as np
4
+ import gsplat as gs
5
+ import torch.nn as nn
6
+ from copy import deepcopy
7
+ import torch.nn.functional as F
8
+ from dataclasses import dataclass
9
+ from ops.utils import (
10
+ dpt2xyz,
11
+ alpha_inpaint_mask,
12
+ transform_points,
13
+ numpy_normalize,
14
+ numpy_quaternion_from_matrix
15
+ )
16
+
17
# NOTE(review): @dataclass is kept for backward compatibility. The class
# declares no annotated fields, so the generated __eq__ compares empty field
# tuples (any two Frames compare equal) and instances are unhashable.
@dataclass
class Frame():
    '''
    rgb: in shape of H*W*3, in range of 0-1
    dpt: in shape of H*W, real depth
    inpaint: bool mask in shape of H*W for inpainting
    intrinsic: 3*3
    extrinsic: array in shape of 4*4

    As a class for:
        initialize camera
        accept rendering result
        accept inpainting result
    All at 2D-domain

    On construction ``rgb`` is converted to a numpy array (any image-like
    object ``np.array`` accepts) and divided by 255 when its maximum exceeds
    1.1; ``extrinsic`` defaults to the 4*4 identity and its inverse is cached
    as ``inv_extrinsic``.
    '''
    def __init__(self,
                 H: int = None,
                 W: int = None,
                 rgb: np.ndarray = None,
                 dpt: np.ndarray = None,
                 sky: np.ndarray = None,
                 inpaint: np.ndarray = None,
                 intrinsic: np.ndarray = None,
                 extrinsic: np.ndarray = None,
                 # detailed target
                 ideal_dpt: np.ndarray = None,
                 ideal_nml: np.ndarray = None,
                 prompt: str = None) -> None:
        self.H = H
        self.W = W
        self.rgb = rgb
        self.dpt = dpt
        self.sky = sky
        self.prompt = prompt
        self.intrinsic = intrinsic
        self.extrinsic = extrinsic
        self._rgb_rect()
        self._extr_rect()
        # for inpainting (both names start out bound to the same object)
        self.inpaint = inpaint
        self.inpaint_wo_edge = inpaint
        # for supervision
        self.ideal_dpt = ideal_dpt
        self.ideal_nml = ideal_nml

    def _rgb_rect(self):
        """Normalise ``self.rgb`` to a numpy array with values in 0-1."""
        if self.rgb is None:
            return
        if not isinstance(self.rgb, np.ndarray):
            # Covers every PIL image class (not only PNG/JPEG files) as well
            # as nested lists, without depending on PIL plugin submodules
            # having been imported.
            self.rgb = np.array(self.rgb)
        if np.amax(self.rgb) > 1.1:
            # values look like 0-255: rescale to 0-1
            self.rgb = self.rgb / 255

    def _extr_rect(self):
        """Default the extrinsic to identity and cache its inverse."""
        if self.extrinsic is None: self.extrinsic = np.eye(4)
        self.inv_extrinsic = np.linalg.inv(self.extrinsic)
74
+
75
@dataclass
class Gaussian_Frame():
    '''
    In-frame-frustum
    Gaussians from a single RGBD frame
    As a class for:
        accept information from initialized/inpainting+geo-estimated frame
        saving pixelsplat properties including rgb, xyz, scale, rotation, opacity

    Each pixel selected by ``frame.inpaint_wo_edge`` becomes one gaussian:
    its depth is back-projected to a world-space ``xyz``, and rgb / scale /
    opacity are stored in their de-activated form (logit / log / logit).
    '''
    # Class-level defaults. These must be plain values: a trailing comma here
    # would silently make each default a one-element tuple.
    # as pixelsplat gaussian
    rgb: torch.Tensor = None
    scale: torch.Tensor = None
    opacity: torch.Tensor = None
    rotation: torch.Tensor = None
    # gaussian center
    dpt: torch.Tensor = None
    xyz: torch.Tensor = None
    # as a frame
    H: int = 480
    W: int = 640

    def __init__(self, frame: 'Frame', device = 'cuda'):
        '''Build the gaussians of ``frame`` (a complete init/inpainted frame) on ``device``.'''
        # de-activation functions (inverses of the sigmoid/exp applied at render time)
        # NOTE(review): logit maps exact 0 or 1 to -inf/+inf; confirm that rgb
        # never hits those values or pass an eps.
        self.rgbs_deact = torch.logit
        self.scales_deact = torch.log
        self.opacity_deact = torch.logit
        self.device = device
        # for gaussian initialization
        self._set_property_from_frame(frame)

    def _to_3d(self):
        """Back-project ``self.dpt`` and move the points out of the camera frame."""
        xyz = dpt2xyz(self.dpt,self.intrinsic)
        inv_extrinsic = np.linalg.inv(self.extrinsic)
        xyz = transform_points(xyz,inv_extrinsic)
        return xyz

    def _paint_filter(self,paint_mask):
        """Keep only the gaussians of the pixels selected by ``paint_mask``."""
        if np.sum(paint_mask)<3:
            # (almost) nothing selected: fall back to the first image row
            paint_mask = np.zeros((self.H,self.W))
            paint_mask[0:1] = 1
        paint_mask = paint_mask>.5
        self.rgb = self.rgb[paint_mask]
        self.xyz = self.xyz[paint_mask]
        self.scale = self.scale[paint_mask]
        self.opacity = self.opacity[paint_mask]
        self.rotation = self.rotation[paint_mask]

    def _to_cuda(self):
        """Convert the numpy properties to float32 tensors on ``self.device``."""
        self.rgb = torch.from_numpy(self.rgb.astype(np.float32)).to(self.device)
        self.xyz = torch.from_numpy(self.xyz.astype(np.float32)).to(self.device)
        self.scale = torch.from_numpy(self.scale.astype(np.float32)).to(self.device)
        self.opacity = torch.from_numpy(self.opacity.astype(np.float32)).to(self.device)
        self.rotation = torch.from_numpy(self.rotation.astype(np.float32)).to(self.device)

    def _fine_init_scale_rotations(self):
        # from https://arxiv.org/pdf/2406.09394
        """ Compute rotation matrices that align z-axis with given normal vectors using matrix operations. """
        # NOTE(review): reads self.nml, which is not assigned anywhere in this
        # class; the caller must set it before using this initialiser.
        up_axis = np.array([0,1,0])
        nml = self.nml @ self.extrinsic[0:3,0:3]
        qz = numpy_normalize(nml)
        qx = np.cross(up_axis,qz)
        qx = numpy_normalize(qx)
        qy = np.cross(qz,qx)
        qy = numpy_normalize(qy)
        rot = np.concatenate([qx[...,None],qy[...,None],qz[...,None]],axis=-1)
        self.rotation = numpy_quaternion_from_matrix(rot)
        # scale: clamp the z component of the normal to 0.2 so the divisions
        # below stay bounded
        safe_nml = deepcopy(self.nml)
        safe_nml[safe_nml[:,:,-1]<0.2,-1] = .2
        normal_xoz = deepcopy(safe_nml)
        normal_yoz = deepcopy(safe_nml)
        normal_xoz[...,1] = 0.
        normal_yoz[...,0] = 0.
        normal_xoz = numpy_normalize(normal_xoz)
        normal_yoz = numpy_normalize(normal_yoz)
        cos_theta_x = np.abs(normal_xoz[...,2])
        cos_theta_y = np.abs(normal_yoz[...,2])
        scale_basic = self.dpt / self.intrinsic[0,0] / np.sqrt(2)
        scale_x = scale_basic / cos_theta_x
        scale_y = scale_basic / cos_theta_y
        scale_z = (scale_x + scale_y) / 10.
        self.scale = np.concatenate([scale_x[...,None],
                                     scale_y[...,None],
                                     scale_z[...,None]],axis=-1)

    def _coarse_init_scale_rotations(self):
        """Isotropic initialisation: scale from depth and focal length, identity rotation."""
        # gaussian property -- HW3 scale
        self.scale = self.dpt / self.intrinsic[0,0] / np.sqrt(2)
        self.scale = self.scale[:,:,None].repeat(3,-1)
        # gaussian property -- HW4 rotation, first component 1 and the rest 0
        self.rotation = np.zeros((self.H,self.W,4))
        self.rotation[:,:,0] = 1.

    def _set_property_from_frame(self,frame: 'Frame'):
        '''frame here is a complete init/inpainted frame'''
        # basic frame-level property
        self.H = frame.H
        self.W = frame.W
        self.dpt = frame.dpt
        self.intrinsic = frame.intrinsic
        self.extrinsic = frame.extrinsic
        # gaussian property -- xyz
        self.xyz = self._to_3d()
        # gaussian property -- HW3 rgb
        self.rgb = frame.rgb
        # gaussian property -- HW4 rotation HW3 scale
        self._coarse_init_scale_rotations()
        # gaussian property -- HW opacity
        self.opacity = np.ones((self.H,self.W,1)) * 0.8
        # select pixels, then move to the device
        self._paint_filter(frame.inpaint_wo_edge)
        self._to_cuda()
        # de-activate
        self.rgb = self.rgbs_deact(self.rgb)
        self.scale = self.scales_deact(self.scale)
        self.opacity = self.opacity_deact(self.opacity)
        # to torch parameters (frozen until _require_grad is called)
        self.rgb = nn.Parameter(self.rgb,requires_grad=False)
        self.xyz = nn.Parameter(self.xyz,requires_grad=False)
        self.scale = nn.Parameter(self.scale,requires_grad=False)
        self.opacity = nn.Parameter(self.opacity,requires_grad=False)
        self.rotation = nn.Parameter(self.rotation,requires_grad=False)

    def _require_grad(self,sign=True):
        """Switch gradient tracking on or off for all gaussian parameters."""
        self.rgb = self.rgb.requires_grad_(sign)
        self.xyz = self.xyz.requires_grad_(sign)
        self.scale = self.scale.requires_grad_(sign)
        self.opacity = self.opacity.requires_grad_(sign)
        self.rotation = self.rotation.requires_grad_(sign)
208
+
209
class Gaussian_Scene():
    """A scene made of per-frame gaussian sets, rendered jointly with gsplat."""

    def __init__(self,cfg=None):
        """
        :cfg: optional config; when given, the trajectory settings are read
              from ``cfg.scene.traj`` (near_percentage, far_percentage,
              traj_forward_ratio, traj_backward_ratio); otherwise the
              defaults 5, 50, 0.3, 0.4 are used
        """
        # source frames of the scene
        self.frames = []
        # one Gaussian_Frame per added frame
        self.gaussian_frames: 'list[Gaussian_Frame]' = []
        # activation functions (inverse of the de-activation done by Gaussian_Frame)
        self.rgbs_act = torch.sigmoid
        self.scales_act = torch.exp
        self.opacity_act = torch.sigmoid
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # for traj generation
        self.traj_type = 'spiral'
        if cfg is not None:
            self.traj_min_percentage = cfg.scene.traj.near_percentage
            self.traj_max_percentage = cfg.scene.traj.far_percentage
            self.traj_forward_ratio = cfg.scene.traj.traj_forward_ratio
            self.traj_backward_ratio = cfg.scene.traj.traj_backward_ratio
        else:
            self.traj_min_percentage,self.traj_max_percentage,self.traj_forward_ratio,self.traj_backward_ratio = 5, 50, 0.3, 0.4

    # basic operations
    def _render_RGBD(self,frame,background_color='black'):
        '''
        Render all gaussians of the scene from the camera of ``frame``.

        :frame: provides H, W, a 3*3 ``intrinsic`` and a 4*4 ``extrinsic`` (numpy)
        :background_color: 'white' renders over a constant background of 0.1
                           per colour channel (0 for depth); any other value
                           passes no background to the rasterizer
        :out: (rgb H*W*3, depth H*W, alpha exactly as returned by the rasterizer)
        '''
        background = None
        if background_color =='white':
            # NOTE(review): 'white' maps to 0.1, not 1.0 -- confirm the intent
            background = torch.ones(1,4,device=self.device)*0.1
            background[:,-1] = 0. # for depth
        # gather the parameters of every gaussian frame
        xyz = torch.cat([gf.xyz.reshape(-1,3) for gf in self.gaussian_frames],dim=0)
        rgb = torch.cat([gf.rgb.reshape(-1,3) for gf in self.gaussian_frames],dim=0)
        scale = torch.cat([gf.scale.reshape(-1,3) for gf in self.gaussian_frames],dim=0)
        opacity = torch.cat([gf.opacity.reshape(-1) for gf in self.gaussian_frames],dim=0)
        rotation = torch.cat([gf.rotation.reshape(-1,4) for gf in self.gaussian_frames],dim=0)
        # activate
        rgb = self.rgbs_act(rgb)
        scale = self.scales_act(scale)
        rotation = F.normalize(rotation,dim=1)
        opacity = self.opacity_act(opacity)
        # camera
        H,W = frame.H, frame.W
        intrinsic = torch.from_numpy(frame.intrinsic.astype(np.float32)).to(self.device)
        extrinsic = torch.from_numpy(frame.extrinsic.astype(np.float32)).to(self.device)
        # render
        render_out,render_alpha,_ = gs.rendering.rasterization(means = xyz,
                                                               scales = scale,
                                                               quats = rotation,
                                                               opacities = opacity,
                                                               colors = rgb,
                                                               Ks = intrinsic[None],
                                                               viewmats = extrinsic[None],
                                                               width = W,
                                                               height = H,
                                                               packed = False,
                                                               near_plane= 0.01,
                                                               render_mode="RGB+ED",
                                                               backgrounds=background) # render: 1*H*W*(3+1)
        # drop only the leading camera axis; a bare squeeze() would also
        # remove H or W when either equals 1
        render_out = render_out.squeeze(0) # result: H*W*(3+1)
        render_rgb = render_out[:,:,0:3]
        render_dpt = render_out[:,:,-1]
        return render_rgb, render_dpt, render_alpha

    @torch.no_grad()
    def _render_for_inpaint(self,frame):
        """Render the scene into ``frame``: fill its rgb and dpt (numpy) and
        set ``frame.inpaint`` to ``alpha_inpaint_mask`` of the rendered alpha.
        Returns the same frame object."""
        render_rgb, render_dpt, render_alpha = self._render_RGBD(frame)
        render_msk = alpha_inpaint_mask(render_alpha)
        # assign back as numpy
        frame.rgb = render_rgb.detach().cpu().numpy()
        frame.dpt = render_dpt.detach().cpu().numpy()
        frame.inpaint = render_msk
        return frame

    def _add_trainable_frame(self,frame:'Frame',require_grad=True):
        """Register ``frame`` and its gaussians; ``require_grad`` decides
        whether the new gaussians take part in optimisation."""
        self.frames.append(frame)
        gf = Gaussian_Frame(frame, self.device)
        gf._require_grad(require_grad)
        self.gaussian_frames.append(gf)
296
+
ops/gs/sh_utils.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+
4
# Hard-coded coefficients of the real spherical-harmonics basis, bands 0 to 4.
C0 = 0.28209479177387814
C1 = 0.4886025119029199
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396
]
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435
]
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]


def eval_sh(deg, sh, dirs):
    """
    Evaluate spherical harmonics at unit directions
    using hardcoded SH polynomials.
    ... Can be 0 or more batch dimensions.
    Args:
        deg: SH degree, 0 to 4 (checked with an assert)
        sh: SH coeffs [..., C, (deg + 1) ** 2] (more trailing coeffs are accepted)
        dirs: unit directions [..., 3]
    Returns:
        [..., C]
    """
    assert 0 <= deg <= 4
    needed = (deg + 1) ** 2
    assert sh.shape[-1] >= needed

    # band 0
    out = C0 * sh[..., 0]
    if deg <= 0:
        return out

    # band 1
    x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
    out = out - C1 * y * sh[..., 1]
    out = out + C1 * z * sh[..., 2]
    out = out - C1 * x * sh[..., 3]
    if deg <= 1:
        return out

    # band 2
    xx, yy, zz = x * x, y * y, z * z
    xy, yz, xz = x * y, y * z, x * z
    out = out + C2[0] * xy * sh[..., 4]
    out = out + C2[1] * yz * sh[..., 5]
    out = out + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6]
    out = out + C2[3] * xz * sh[..., 7]
    out = out + C2[4] * (xx - yy) * sh[..., 8]
    if deg <= 2:
        return out

    # band 3
    out = out + C3[0] * y * (3 * xx - yy) * sh[..., 9]
    out = out + C3[1] * xy * z * sh[..., 10]
    out = out + C3[2] * y * (4 * zz - xx - yy)* sh[..., 11]
    out = out + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12]
    out = out + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13]
    out = out + C3[5] * z * (xx - yy) * sh[..., 14]
    out = out + C3[6] * x * (xx - 3 * yy) * sh[..., 15]
    if deg <= 3:
        return out

    # band 4
    out = out + C4[0] * xy * (xx - yy) * sh[..., 16]
    out = out + C4[1] * yz * (3 * xx - yy) * sh[..., 17]
    out = out + C4[2] * xy * (7 * zz - 1) * sh[..., 18]
    out = out + C4[3] * yz * (7 * zz - 3) * sh[..., 19]
    out = out + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20]
    out = out + C4[5] * xz * (7 * zz - 3) * sh[..., 21]
    out = out + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22]
    out = out + C4[7] * xz * (xx - 3 * yy) * sh[..., 23]
    out = out + C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24]
    return out
91
+
92
def RGB2SH(rgb):
    """Map a colour to its band-0 SH coefficient: ``(rgb - 0.5) / C0``."""
    centred = rgb - 0.5
    return centred / C0
94
+
95
def SH2RGB(sh):
    """Map a band-0 SH coefficient back to a colour: ``sh * C0 + 0.5``."""
    scaled = sh * C0
    return scaled + 0.5
ops/gs/train.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import tqdm
3
+ import torch
4
+ # import lpips
5
+ import numpy as np
6
+ from ops import utils
7
+ import torch.nn.functional as F
8
+ import torchvision.transforms as tvtf
9
+ from ops.gs.basic import Gaussian_Scene,Frame
10
+ from torchmetrics.image import StructuralSimilarityIndexMeasure
11
+
12
class RGB_Loss():
    """Photometric loss: smooth-L1 on the (optionally masked) pixels plus a
    weighted ``1 - SSIM`` term computed on the full image."""

    def __init__(self,w_lpips=0.2,w_ssim=0.2,device=None):
        """
        Args:
            w_lpips: stored only; the LPIPS term is currently disabled.
            w_ssim: weight of the ``1 - SSIM`` term.
            device: device for the SSIM module; defaults to 'cuda' when
                available, else 'cpu'.
        """
        if device is None:
            # same fallback rule as the rest of the file, instead of assuming a GPU
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.rgb_loss = F.smooth_l1_loss
        self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0).to(device)
        self.w_ssim = w_ssim
        self.w_lpips = w_lpips

    def __call__(self,pr,gt,valid_mask=None):
        """Return the scalar loss between prediction ``pr`` and target ``gt``.

        Args:
            pr, gt: H*W*C tensors (H*W inputs are repeated to 3 channels);
                NaNs are replaced via ``torch.nan_to_num``.
            valid_mask: optional H*W mask selecting the pixels that enter the
                smooth-L1 term; the SSIM term always uses the full image.
        """
        pr = torch.nan_to_num(pr)
        gt = torch.nan_to_num(gt)
        if len(pr.shape) < 3: pr = pr[:,:,None].repeat(1,1,3)
        if len(gt.shape) < 3: gt = gt[:,:,None].repeat(1,1,3)
        if valid_mask is not None:
            pr_valid = pr[valid_mask]
            gt_valid = gt[valid_mask]
        else:
            pr_valid = pr.reshape(-1,pr.shape[-1])
            gt_valid = gt.reshape(-1,gt.shape[-1])
        l_rgb = self.rgb_loss(pr_valid,gt_valid)
        # SSIM expects N*C*H*W
        l_ssim = 1.0 - self.ssim(pr[None].permute(0, 3, 1, 2), gt[None].permute(0, 3, 1, 2))
        return l_rgb + self.w_ssim * l_ssim
31
+
32
class GS_Train_Tool():
    '''
    Frames and well-trained gaussians are kept, refine the trainable gaussians
    The supervision comes from the Frames of GS_Scene
    '''
    def __init__(self,
                 GS:'Gaussian_Scene',
                 iters = 100) -> None:
        """
        Args:
            GS: scene whose gaussian frames with ``requires_grad`` set are optimised.
            iters: number of optimisation steps run by ``__call__``.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # hyperparameters for prune, densify, and update
        self.lr_factor = 1.00
        self.lr_update = 0.99
        # learning rate
        self.rgb_lr = 0.0005
        self.xyz_lr = 0.0001
        self.scale_lr = 0.005
        self.opacity_lr = 0.05
        self.rotation_lr = 0.001
        # GSs for training
        self.GS = GS
        # hyperparameters for training
        self.iters = iters
        self._init_optimizer()
        self.rgb_lossfunc = RGB_Loss(w_lpips=0)

    def _init_optimizer(self):
        """Create one Adam optimizer with a separate learning rate per property."""
        self.optimize_frames = [gf for gf in self.GS.gaussian_frames if gf.rgb.requires_grad]
        # following https://github.com/pointrix-project/msplat
        self.optimizer = torch.optim.Adam([
            {'params': [gf.xyz for gf in self.optimize_frames], 'lr': self.xyz_lr},
            {'params': [gf.rgb for gf in self.optimize_frames], 'lr': self.rgb_lr},
            {'params': [gf.scale for gf in self.optimize_frames], 'lr': self.scale_lr},
            {'params': [gf.opacity for gf in self.optimize_frames], 'lr': self.opacity_lr},
            {'params': [gf.rotation for gf in self.optimize_frames], 'lr': self.rotation_lr}
        ])

    def _render(self,frame):
        """Render the scene from ``frame``'s camera; returns (rgb, dpt, alpha)."""
        rgb,dpt,alpha = self.GS._render_RGBD(frame)
        return rgb,dpt,alpha

    def _to_cuda(self,tensor):
        """Convert a numpy array to a float32 tensor on ``self.device``.

        The name is historical: the target is the device selected in
        ``__init__`` (CPU when no GPU is available), not unconditionally CUDA.
        """
        return torch.from_numpy(tensor.astype(np.float32)).to(self.device)

    def __call__(self,target_frames=None):
        """Run ``self.iters`` optimisation steps and return the refined scene.

        Args:
            target_frames: frames used as supervision; defaults to
                ``self.GS.frames``. One frame is drawn at random per step.

        Returns:
            ``self.GS``, with gradient tracking switched off on every gaussian frame.
        """
        target_frames = self.GS.frames if target_frames is None else target_frames
        for _ in tqdm.tqdm(range(self.iters)):
            frame_idx = np.random.randint(0,len(target_frames))
            frame = target_frames[frame_idx]
            render_rgb,_,_ = self._render(frame)
            loss = self.rgb_lossfunc(render_rgb,self._to_cuda(frame.rgb),valid_mask=frame.inpaint)
            # optimization
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
        refined_scene = self.GS
        # freeze everything once training is done
        for gf in refined_scene.gaussian_frames:
            gf._require_grad(False)
        return refined_scene
92
+
ops/llava.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL
2
+ import torch
3
+ import numpy as np
4
+ from transformers import AutoProcessor, LlavaForConditionalGeneration
5
+
6
class Llava():
    """Image describer backed by a Hugging Face LLaVA checkpoint."""

    def __init__(self,device='cuda',
                 llava_ckpt='llava-hf/bakLlava-v1-hf') -> None:
        """Load the model (float16) and its processor.

        Args:
            device: device the model is moved to.
            llava_ckpt: checkpoint id or path passed to ``from_pretrained``.
        """
        self.device = device
        self.model_id = llava_ckpt
        self.model = LlavaForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(self.model_id)

    def __call__(self,image:PIL.Image, prompt=None):
        """Generate text for ``image``.

        Args:
            image: a ``PIL.Image.Image`` or an array. Arrays whose maximum is
                below 1.1 are treated as 0-1 and rescaled to 0-255 before
                conversion to uint8.
            prompt: prompt text; a built-in scene-description prompt is used
                when ``None``.

        Returns:
            The decoded output string (first two output tokens dropped,
            special tokens skipped).
        """
        # input check
        if not isinstance(image,PIL.Image.Image):
            if np.amax(image) < 1.1:
                image = image * 255
            image = image.astype(np.uint8)
            image = PIL.Image.fromarray(image)

        if prompt is None:
            prompt = '<image>\n USER: Detaily imagine and describe the scene this image taken from? \n ASSISTANT: This image is taken from a scene of '
        # Keyword arguments: the positional order of (text, images) is not
        # stable across transformers releases.
        inputs = self.processor(text=prompt, images=image, return_tensors='pt').to(self.model.device,torch.float16)
        output = self.model.generate(**inputs, max_new_tokens=200, do_sample=False)
        answer = self.processor.decode(output[0][2:], skip_special_tokens=True)
        return answer
ops/mcs.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import torchvision.transforms as tvtf
4
+ from tools.StableDiffusion.Hack_SD_stepwise import Hack_SDPipe_Stepwise
5
+
6
+ '''
7
+ Input: Multiview images with added noise
8
+ denoise to x0
9
+ denoise from step t1 to step t2
10
+ '''
11
+
12
class HackSD_MCS():
    '''
    Stepwise Stable Diffusion wrapper:
        transform images to self.latents
        add noise to self.latents
        predict step noise --> x0
        mv RGB-D warp as target image
        target image encode to latent and get target noise
        noise rectification
        step denoise
    '''
    def __init__(self,device='cpu',use_lcm=True,denoise_steps=20,
                 sd_ckpt='tools/StableDiffusion/ckpt',
                 lcm_ckpt='latent-consistency/lcm-lora-sdv1-5') -> None:
        '''
        Load the pipeline and remember how many of its final timesteps are
        used for denoising.
        ref_rgb should be -1~1 tensor B*3*H*W
        '''
        self.device = device
        self.target_type = np.float32
        self.use_lcm = use_lcm
        self.sd_ckpt = sd_ckpt
        self.lcm_ckpt = lcm_ckpt
        self._load_model()
        # define step to add noise and steps to denoise
        self.denoise_steps = denoise_steps
        self.timesteps = self.model.timesteps

    def _load_model(self):
        '''Instantiate the stepwise SD pipeline and move it to self.device.'''
        self.model = Hack_SDPipe_Stepwise.from_pretrained(self.sd_ckpt)
        self.model._use_lcm(self.use_lcm,self.lcm_ckpt)
        self.model.re_init(num_inference_steps=50)
        try:
            self.model.enable_xformers_memory_efficient_attention()
        except Exception:
            # best effort: run without xformers, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would
            pass
        self.model = self.model.to(self.device)

    def to(self, device):
        '''Move the pipeline to `device`; returns self so calls can be chained.'''
        self.device = device
        self.model.to(device)
        return self

    @torch.no_grad()
    def _add_noise_to_latent(self,latents):
        '''
        Noise `latents` at timestep self.timesteps[-self.denoise_steps].
        Returns (noisy_latent, timestep, target), where target follows the
        scheduler's prediction type (epsilon or v_prediction).
        Raises ValueError for any other prediction type.
        '''
        bsz = latents.shape[0]
        # in the Stable Diffusion, the iterations numbers is 1000 for adding the noise and denosing.
        timestep = self.timesteps[-self.denoise_steps]
        timestep = timestep.repeat(bsz).to(self.device)
        # target noise
        noise = torch.randn_like(latents)
        # add noise
        noisy_latent = self.model.scheduler.add_noise(latents, noise, timestep)
        # -------------------- noise for supervision -----------------
        prediction_type = self.model.scheduler.config.prediction_type
        if prediction_type == "epsilon":
            target = noise
        elif prediction_type == "v_prediction":
            target = self.model.scheduler.get_velocity(latents, noise, timestep)
        else:
            # previously fell through and failed with an UnboundLocalError
            raise ValueError(f'Unsupported prediction type: {prediction_type}')
        return noisy_latent, timestep, target

    @torch.no_grad()
    def _encode_mv_init_images(self, images):
        '''
        images should be B3HW; they are mapped with x*2-1, encoded,
        noised, and stored on self.latents
        '''
        images = images * 2 - 1
        self.latents = self.model._encode(images)
        self.latents,_,_ = self._add_noise_to_latent(self.latents)

    @torch.no_grad()
    def _sd_forward(self, denoise_step, prompt_latent:torch.Tensor):
        '''
        Predict noise and x0 for the current latents at step `denoise_step`
        (counted from the start of the denoising window).
        Returns (t, noise_pred, x0) with x0 mapped by (x+1)/2.
        '''
        # temp noise prediction
        t = self.timesteps[[-self.denoise_steps+denoise_step]].to(self.device)
        noise_pred = self.model._step_noise(self.latents, t, prompt_latent.repeat(len(self.latents),1,1))
        # solve image
        _,x0 = self.model._solve_x0(self.latents,noise_pred,t)
        x0 = (x0 + 1) / 2 # in 0-1
        return t, noise_pred, x0

    @torch.no_grad()
    def _denoise_to_x0(self, timestep_in_1000, prompt_latent:torch.Tensor):
        '''
        Same as _sd_forward but for an explicitly given timestep.
        Returns (noise_pred, x0) with x0 mapped by (x+1)/2.
        '''
        # temp noise prediction
        noise_pred = self.model._step_noise(self.latents, timestep_in_1000, prompt_latent.repeat(len(self.latents),1,1))
        # solve image
        _,x0 = self.model._solve_x0(self.latents,noise_pred,timestep_in_1000)
        x0 = (x0 + 1) / 2 # in 0-1
        return noise_pred, x0

    @torch.no_grad()
    def _step_denoise(self, t, pred_noise, rect_x0, rect_w = 0.7):
        '''
        Blend the predicted noise with the noise implied by `rect_x0`
        (weight rect_w) and advance self.latents by one denoising step.
        pred_noise B4H//8W//8
        x0, rect_x0 B3HW
        '''
        # encoder rect_x0 to latent
        rect_x0 = rect_x0 * 2 - 1
        rect_latent = self.model._encode(rect_x0)
        # rectified noise
        rect_noise = self.model._solve_noise_given_x0_latent(self.latents,rect_latent,t)
        # noise rectification: rescale to the per-sample std of pred_noise
        rect_std = rect_noise.std(dim=list(range(1, rect_noise.ndim)),keepdim=True)
        pred_std = pred_noise.std(dim=list(range(1, pred_noise.ndim)),keepdim=True)
        rect_noise = rect_noise / rect_std * pred_std
        pred_noise = pred_noise*(1.-rect_w) + rect_noise*rect_w
        # step forward
        self.latents = self.model._step_denoise(self.latents,pred_noise,t)

    @torch.no_grad()
    def _decode_mv_imgs(self):
        '''Decode self.latents and map the result with (x+1)/2.'''
        imgs = self.model._decode(self.latents)
        imgs = (imgs + 1) / 2
        return imgs
121
+
ops/sky.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation
4
+
5
class Sky_Seg_Tool():
    """Sky segmentation built on the OneFormer ADE20K semantic model."""

    def __init__(self,cfg):
        """Load the OneFormer processor and model (``cfg`` is currently unused)."""
        self.processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_ade20k_swin_large")
        self.model = OneFormerForUniversalSegmentation.from_pretrained("shi-labs/oneformer_ade20k_swin_large")

    @torch.no_grad()
    def __call__(self, img):
        '''
        input rgb should be numpy in range of 0-1 or 0-255
        (an object exposing a PIL-style ``size`` tuple is accepted as well).
        Returns a boolean array that is True where the predicted semantic
        label equals 2.
        '''
        size = getattr(img, 'size', None)
        if isinstance(size, tuple):
            # PIL-style image: size is (W, H)
            target_size = size[::-1]
        else:
            # numpy input: ndarray.size is an int, so take (H, W) from shape
            img = np.asarray(img)
            if np.amax(img) < 2: img = img*255
            target_size = tuple(img.shape[0:2])
        # Semantic Segmentation
        inputs = self.processor(images=img, task_inputs=["semantic"], return_tensors="pt")
        outputs = self.model(**inputs)
        # pass through image_processor for postprocessing
        predicted_semantic_map = self.processor.post_process_semantic_segmentation(outputs, target_sizes=[target_size])[0]
        # NOTE(review): label 2 is presumably the ADE20K "sky" class - confirm
        sky_msk = predicted_semantic_map.numpy() == 2
        return sky_msk
22
+
ops/trajs/__init__.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from ops.sky import Sky_Seg_Tool
3
+ from ops.utils import dpt2xyz
4
+ from .spiral import spiral_camera_poses
5
+
6
class Trajectory_Generation():
    """Generate camera trajectories scaled to the extent of a point set."""

    def __init__(self,
                 scene = None,
                 method = 'spiral') -> None:
        '''
        method = 'spiral' (implemented) / 'rot' / 'spin' (not implemented)
        scene must provide traj_forward_ratio, traj_backward_ratio,
        traj_min_percentage and traj_max_percentage.
        Raises ValueError when scene is None.
        '''
        if scene is None:
            # previously failed later with an opaque AttributeError on None
            raise ValueError('scene is required to configure the trajectory')
        self._method = method
        self.forward_ratio = scene.traj_forward_ratio
        self.backward_ratio = scene.traj_backward_ratio
        self.min_percentage = scene.traj_min_percentage
        self.max_percentage = scene.traj_max_percentage

    def _radius(self, xyz):
        '''Set self.radius to the mean per-axis percentile range of xyz (N*3).'''
        # get range
        _min = np.percentile(xyz,self.min_percentage,axis=0)
        _max = np.percentile(xyz,self.max_percentage,axis=0)
        _range = _max - _min
        # set radius to mean range of three axes
        self.radius = np.mean(_range)

    def _traj_spiral(self, nframe):
        '''Return the spiral camera poses for nframe frames.'''
        trajs = spiral_camera_poses(nframe, self.radius, self.forward_ratio, self.backward_ratio)
        return trajs

    def __call__(self, nframe, xyz):
        '''
        Compute the radius from xyz (flattened to N*3 if needed) and return
        the trajectory for the configured method.
        Raises NotImplementedError for 'rot' / 'spin' and TypeError for any
        other unknown method.
        '''
        if xyz.ndim > 2:
            xyz = xyz.reshape(-1,3)
        self._radius(xyz)
        if self._method == 'spiral':
            return self._traj_spiral(nframe)
        if self._method in ('rot', 'spin'):
            # these generators are not defined on this class
            raise NotImplementedError(f"trajectory method '{self._method}' is not implemented")
        raise TypeError('method = rot / spiral')
44
+
45
def _generate_trajectory(cfg, scene, nframes=None):
    """Build a camera trajectory from the first frame of ``scene``.

    The frame count is ``nframes`` when given, otherwise
    ``cfg.scene.traj.n_sample * 6``. Only non-sky points of the first
    frame's back-projected depth are used to size the trajectory.
    """
    method = scene.traj_type
    if nframes is None:
        nframe = cfg.scene.traj.n_sample*6
    else:
        nframe = nframes
    sky = scene.frames[0].sky
    dpt = scene.frames[0].dpt
    intrinsic = scene.frames[0].intrinsic
    points = dpt2xyz(dpt,intrinsic)
    init_xyz = points[~sky]
    generator = Trajectory_Generation(scene=scene,method=method)
    return generator(nframe,init_xyz)
ops/trajs/spiral.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
def generate_spiral_trajectory(num_frames, radius, forward_ratio=0.2, backward_ratio=0.8):
    """Return per-frame ``(x, y, z)`` camera offsets.

    The offset magnitude follows ``sin(2*pi*t) * radius`` for ``t`` in
    ``[0, 1]``. ``y`` is damped by 0.3, negative ``z`` values are scaled by
    ``forward_ratio`` and positive ones by ``backward_ratio``.
    """
    progress = np.linspace(0, 1, num_frames)
    amplitude = np.sin(2 * np.pi * progress) * radius
    # rotation angles at each frame
    angle = 2 * np.pi * progress * num_frames
    # try not to change y (up-down for floor and sky)
    x = amplitude * np.cos(angle)
    y = amplitude * np.sin(angle) * 0.3
    z = -amplitude
    negative = z < 0
    z[negative] *= forward_ratio
    positive = z > 0
    z[positive] *= backward_ratio
    return x, y, z
16
+
17
def look_at(camera_position, target_position):
    """Return a 3x3 rotation for a camera at ``camera_position`` facing ``target_position``.

    The rows (right, up, forward) are stacked and the inverse of that
    matrix is returned. World up is fixed to ``[0, 1, 0]``.
    """
    # look at direction
    forward = target_position - camera_position
    forward /= np.linalg.norm(forward)
    # calculate rotation matrix
    world_up = np.array([0, 1, 0])
    side = np.cross(world_up, forward)
    side /= np.linalg.norm(side)
    true_up = np.cross(forward, side)
    basis_rows = np.vstack([side, true_up, forward])
    return np.linalg.inv(basis_rows)
29
+
30
def spiral_camera_poses(num_frames, radius, forward_ratio = 0.2, backward_ratio = 0.8, rotation_times = 0.3, look_at_times = 0.5):
    """Return ``num_frames`` 4x4 camera poses along a spiral, in reversed order.

    Camera positions come from ``generate_spiral_trajectory`` with radius
    ``radius * rotation_times``; every camera looks at
    ``[0, 0, radius * look_at_times]``.
    """
    x, y, z = generate_spiral_trajectory(num_frames, radius*rotation_times, forward_ratio, backward_ratio)
    focus = np.array([0,0,radius*look_at_times])
    positions = np.vstack([x, y, z]).T

    poses = []
    for position in positions:
        pose = np.eye(4)
        pose[:3, :3] = look_at(position, focus)
        pose[:3, 3] = position
        poses.append(pose[None])

    # emit the poses in reversed order
    return np.concatenate(poses[::-1],axis=0)
ops/utils.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import matplotlib
5
+ import numpy as np
6
+ import open3d as o3d
7
+ from PIL import Image
8
+ from copy import deepcopy
9
+ from omegaconf import OmegaConf
10
+ from scipy.spatial import cKDTree
11
+
12
def gen_config(cfg_path):
    """Load and return the OmegaConf configuration stored at ``cfg_path``."""
    config = OmegaConf.load(cfg_path)
    return config
14
+
15
def get_focal_from_fov(new_fov, H, W):
    """Return the focal length (pixels) for a field of view given in degrees.

    The field of view is applied to the longer image edge.
    """
    # NOTE: top-left pixel should be (0,0)
    long_edge = W if W >= H else H
    half_fov = np.deg2rad(new_fov / 2.0)
    return (long_edge / 2.0) / np.tan(half_fov)
22
+
23
def get_intrins_from_fov(new_fov, H, W):
    """Return a 3x3 pinhole intrinsic matrix for the given field of view.

    The principal point is placed at ``(W/2 - 0.5, H/2 - 0.5)`` and both
    axes share the focal length from ``get_focal_from_fov``.
    """
    # NOTE: top-left pixel should be (0,0)
    focal = get_focal_from_fov(new_fov,H,W)
    center_u = (W / 2.0) - 0.5
    center_v = (H / 2.0) - 0.5
    return np.array([
        [focal, 0,     center_u],
        [0,     focal, center_v],
        [0,     0,     1       ],
    ])
37
+
38
def dpt2xyz(dpt,intrinsic):
    """Back-project a depth map (H*W) to camera-space points (H*W*3).

    Each pixel ``(u, v)`` with depth ``d`` becomes
    ``inv(intrinsic) @ [u*d, v*d, d]``.
    """
    # get grid
    height, width = dpt.shape[0:2]
    grid_u, grid_v = np.meshgrid(np.arange(width), np.arange(height))
    homogeneous = np.stack([grid_u, grid_v, np.ones_like(grid_v)], axis=-1)
    uvz = homogeneous * dpt[:,:,None]
    # inv intrinsic
    inv_intrinsic = np.linalg.inv(intrinsic)
    return np.einsum('ab,hwb->hwa',inv_intrinsic,uvz)
49
+
50
def dpt2xyz_torch(dpt,intrinsic):
    """Torch version of ``dpt2xyz``: depth map (H*W) to points (H*W*3).

    The pixel grid is cast to the dtype/device of ``dpt`` before use.
    """
    # get grid
    height, width = dpt.shape[0:2]
    grid_v, grid_u = torch.meshgrid(torch.arange(height), torch.arange(width), indexing='ij')
    homogeneous = torch.stack([grid_u, grid_v, torch.ones_like(grid_v)], dim=-1).to(dpt)
    uvz = homogeneous * dpt[:,:,None]
    # inv intrinsic
    inv_intrinsic = torch.linalg.inv(intrinsic)
    return torch.einsum('ab,hwb->hwa',inv_intrinsic,uvz)
61
+
62
def visual_pcd(xyz, color=None, normal = True):
    """Show a point cloud in an Open3D window.

    Args:
        xyz: an array of points (anything exposing ``ndim``; reshaped to
            N*3 and divided by the mean point norm), or an object used
            directly as the point cloud.
        color: optional per-point colors, reshaped to N*3.
        normal: estimate normals before drawing when True.
    """
    if hasattr(xyz,'ndim'):
        # Flatten first so the norm is taken per point; taking it over
        # axis=1 before the reshape is wrong for H*W*3 input.
        xyz = xyz.reshape(-1,3)
        xyz_norm = np.mean(np.sqrt(np.sum(np.square(xyz),axis=-1)))
        xyz = xyz / xyz_norm
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(xyz)
    else: pcd = xyz
    if color is not None:
        color = color.reshape(-1,3)
        pcd.colors = o3d.utility.Vector3dVector(color)
    if normal:
        pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(0.2, 20))
    o3d.visualization.draw_geometries([pcd])
76
+
77
def visual_pcds(xyzs, normal = True):
    """Show several point clouds together in one Open3D window.

    Array items (anything exposing ``ndim``) are reshaped to N*3 and painted
    with a random uniform color; other items are used as point clouds as-is.
    """
    clouds = []
    for item in xyzs:
        if hasattr(item,'ndim'):
            cloud = o3d.geometry.PointCloud()
            cloud.points = o3d.utility.Vector3dVector(item.reshape(-1,3))
            cloud.paint_uniform_color(np.random.rand(3))
        else:
            cloud = item
        if normal:
            cloud.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(0.2, 20))
        clouds.append(cloud)
    o3d.visualization.draw_geometries(clouds)
92
+
93
def save_pic(input_pic:np.array,save_fn,normalize=True):
    """Scale an array to 0-255 and optionally write it as an image.

    Args:
        input_pic: numpy array; it is copied, never modified in place.
        save_fn: output path, or ``None`` to skip writing.
        normalize: when True, stretch the 2nd..98th percentile range to 0..1
            before scaling. A flat image (zero range) maps to all zeros
            instead of dividing by zero.

    Returns:
        The float32 array scaled to and clipped at 0..255.
    """
    # astype copies, so the caller's array is left untouched
    pic = np.nan_to_num(np.asarray(input_pic).astype(np.float32))
    if normalize:
        vmin = np.percentile(pic, 2)
        vmax = np.percentile(pic, 98)
        span = vmax - vmin
        if span > 0:
            pic = (pic - vmin) / span
        else:
            pic = np.zeros_like(pic)
    pic = (pic * 255.0).clip(0, 255)
    if save_fn is not None:
        pic_save = Image.fromarray(pic.astype(np.uint8))
        pic_save.save(save_fn)
    return pic
106
+
107
def depth_colorize(dpt,sky_mask=None):
    """Return an RGB visualisation (values 0..1) of a depth map.

    Depth is normalised by ``dpt_normalize`` and mapped through the
    matplotlib "Spectral" colormap; the alpha channel is dropped.
    """
    colormap = matplotlib.colormaps["Spectral"]
    normalized = dpt_normalize(dpt,sky_mask)
    rgba = colormap(normalized, bytes=False)  # value from 0 to 1
    return rgba[:, :, 0:3]
112
+
113
def dpt_normalize(dpt, sky_mask = None):
    """Normalise depth by its 2nd..98th percentile range.

    Args:
        dpt: numpy depth array (not modified).
        sky_mask: optional boolean mask; masked pixels are excluded from the
            percentile statistics and set to 1 in the output.

    Returns:
        A new array ``(dpt - p2) / (p98 - p2)``. When the range is zero or
        no unmasked pixel exists, non-sky pixels are 0 instead of NaN/inf.
    """
    samples = dpt if sky_mask is None else dpt[~sky_mask]
    span = 0
    if np.size(samples) > 0:
        vmin = np.percentile(samples, 2)
        vmax = np.percentile(samples, 98)
        span = vmax - vmin
    if span > 0:
        # the arithmetic allocates a new array; dpt is not modified
        out = (dpt - vmin) / span
    else:
        out = np.zeros(np.shape(dpt), dtype=np.result_type(dpt.dtype, np.float32))
    if sky_mask is not None:
        out[sky_mask] = 1.
    return out
124
+
125
def transform_points(pts,transform):
    """Apply a 3x3, 3x4 or 4x4 transform to row-vector points (N*3).

    Raises:
        NotImplementedError: for any other transform shape.
    """
    rows, cols = transform.shape
    if (rows, cols) == (3, 3):
        return pts @ transform.T
    if cols == 4 and rows in (3, 4):
        rotation = transform[0:3,:3]
        translation = transform[0:3,3:]
        return pts @ rotation.T + translation.T
    raise NotImplementedError
134
+
135
def get_nml_from_quant(quant):
    '''
    Compute one 3-vector per quaternion (columns read as w, x, y, z).
    input N*4
    output N*3
    follow https://arxiv.org/pdf/2404.17774
    '''
    qw, qx, qy, qz = quant[:,0], quant[:,1], quant[:,2], quant[:,3]
    components = (
        2*qx*qz+2*qy*qw,
        2*qy*qz-2*qx*qw,
        1-2*qx*qx-2*qy*qy,
    )
    return torch.stack(components,dim=1)
150
+
151
def quaternion_from_matrix(M):
    """Convert a batch of 3x3 rotation matrices (N*3*3) to quaternions (N*4).

    Output columns are ordered (w, x, y, z) and the sign is chosen so that
    ``w`` is not negative. The quaternion is the eigenvector of the 4x4
    matrix ``K`` belonging to its largest eigenvalue.
    """
    m00 = M[..., 0, 0]
    m01 = M[..., 0, 1]
    m02 = M[..., 0, 2]
    m10 = M[..., 1, 0]
    m11 = M[..., 1, 1]
    m12 = M[..., 1, 2]
    m20 = M[..., 2, 0]
    m21 = M[..., 2, 1]
    m22 = M[..., 2, 2]
    # only the lower triangle is filled; eigh reads the lower triangle
    K = torch.zeros((len(M),4,4)).to(M)
    K[:,0,0] = m00 - m11 - m22
    K[:,1,0] = m01 + m10
    K[:,1,1] = m11 - m00 - m22
    K[:,2,0] = m02 + m20
    K[:,2,1] = m12 + m21
    K[:,2,2] = m22 - m00 - m11
    K[:,3,0] = m21 - m12
    K[:,3,1] = m02 - m20
    K[:,3,2] = m10 - m01
    K[:,3,3] = m00 + m11 + m22
    K = K/3
    # quaternion is eigenvector of K that corresponds to largest eigenvalue
    w, V = torch.linalg.eigh(K)
    q = V[torch.arange(len(V)),:,torch.argmax(w,dim=1)]
    q = q[:,[3, 0, 1, 2]]
    # flip rows with negative w in one vectorised step (no per-row loop)
    q = torch.where(q[:, :1] < 0., -q, q)
    return q
181
+
182
def numpy_quaternion_from_matrix(M):
    """Convert a grid of rotation matrices (H*W*3*3) to quaternions (H*W*4).

    Output components are ordered (w, x, y, z) and the sign is chosen so
    that ``w`` is not negative.
    """
    H,W = M.shape[0:2]
    M = M.reshape(-1,3,3)
    m00 = M[..., 0, 0]
    m01 = M[..., 0, 1]
    m02 = M[..., 0, 2]
    m10 = M[..., 1, 0]
    m11 = M[..., 1, 1]
    m12 = M[..., 1, 2]
    m20 = M[..., 2, 0]
    m21 = M[..., 2, 1]
    m22 = M[..., 2, 2]
    # only the lower triangle is filled; eigh reads the lower triangle
    K = np.zeros((len(M),4,4))
    K[...,0,0] = m00 - m11 - m22
    K[...,1,0] = m01 + m10
    K[...,1,1] = m11 - m00 - m22
    K[...,2,0] = m02 + m20
    K[...,2,1] = m12 + m21
    K[...,2,2] = m22 - m00 - m11
    K[...,3,0] = m21 - m12
    K[...,3,1] = m02 - m20
    K[...,3,2] = m10 - m01
    K[...,3,3] = m00 + m11 + m22
    K = K/3
    # quaternion is eigenvector of K that corresponds to largest eigenvalue
    w, V = np.linalg.eigh(K)
    q = V[np.arange(len(V)),:,np.argmax(w,axis=1)]
    q = q[...,[3, 0, 1, 2]]
    # flip rows with negative w in one vectorised step (no per-row loop)
    q[q[:,0] < 0.] *= -1
    q = q.reshape(H,W,4)
    return q
215
+
216
def numpy_normalize(input):
    """Divide vectors along the last axis by their Euclidean length plus 1e-5."""
    squared = np.square(input)
    length = np.sqrt(np.sum(squared,axis=-1,keepdims=True))
    return input / (length+1e-5)
219
+
220
class suppress_stdout_stderr(object):
    '''
    Avoid terminal output of diffusion processings!
    A context manager for doing a "deep suppression" of stdout and stderr in
    Python, i.e. will suppress all print, even if the print originates in a
    compiled C/Fortran sub-function.
    This will not suppress raised exceptions, since exceptions are printed
    to stderr just before a script exits, and after the context manager has
    exited (at least, I think that is why it lets exceptions through).

    Each instance is single-use: every descriptor it opened is closed on exit.
    '''
    def __init__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        # Assign the null pointers to stdout and stderr.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)
        return self

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2)
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        # Close the null files and the saved duplicates; leaving the
        # duplicates open leaks two descriptors per use.
        for fd in list(self.null_fds) + list(self.save_fds):
            os.close(fd)
249
+
250
+ import torch.nn.functional as F
251
def nei_delta(input,pad=2):
    """Collect each pixel's (2*pad+1)^2 neighbourhood with replicate padding.

    Args:
        input: H*W or H*W*C array or tensor; numpy input is converted to a
            float32 tensor.
        pad: neighbourhood radius in pixels.

    Returns:
        ``(max, min, patches)`` where ``patches`` holds the neighbourhood
        values on its last axis and max/min are reductions over that axis.
    """
    # isinstance (not an exact type check) so Tensor subclasses such as
    # nn.Parameter are not sent through the numpy conversion
    if not isinstance(input, torch.Tensor):
        input = torch.from_numpy(input.astype(np.float32))
    if len(input.shape) < 3:
        input = input[:,:,None]
    h,w,c = input.shape
    # reshape to 1*C*H*W for pad/unfold
    input = input.permute(2,0,1)[None]
    input = F.pad(input, pad=(pad,pad,pad,pad), mode='replicate')
    kernel = 2*pad + 1
    input = F.unfold(input,[kernel,kernel],padding=0)
    input = input.reshape(c,-1,h,w).permute(2,3,0,1).squeeze() # hw(3)*25
    return torch.amax(input,dim=-1),torch.amin(input,dim=-1),input
264
+
265
def inpaint_mask(render_dpt,render_rgb):
    """Build an inpainting mask from rendered depth and patch small holes in rgb.

    Pixels are masked when the depth is below 1e-3 (hole) or when the local
    depth range exceeds 20% of the 15th..85th percentile depth spread (edge).
    Masked rgb pixels are replaced by a median-blurred version; the mask is
    then eroded and dilated.

    Returns:
        ``(mask, render_rgb)``: a boolean numpy mask and the patched rgb
        tensor cast like ``render_dpt``.
    """
    # edge filter delta thres
    sorted_valid = torch.sort(render_dpt[render_dpt>1e-3]).values
    n_valid = len(sorted_valid)
    dpt_hi = sorted_valid[int(.85*n_valid)]
    dpt_lo = sorted_valid[int(.15*n_valid)]
    ths = (dpt_hi-dpt_lo) * 0.2
    # nei check
    nei_max, nei_min, _ = nei_delta(render_dpt,pad=1)
    edge_mask = (nei_max - nei_min) > ths
    # render hole
    hole_mask = render_dpt < 1e-3
    # whole mask -- original noise and sparse
    mask = (edge_mask | hole_mask).cpu().float().numpy()

    # modify rgb sightly for small holes : blur and sharpen
    rgb_u8 = (render_rgb.detach().cpu().numpy()*255).astype(np.uint8)
    rgb_blur = cv2.medianBlur(rgb_u8,5)
    replace = mask>.5
    rgb_u8[replace] = rgb_blur[replace] # blur and replace small holes
    render_rgb = torch.from_numpy((rgb_u8/255).astype(np.float32)).to(render_dpt)

    # slightly clean mask
    kernel = np.ones((5,5),np.uint8)
    mask = cv2.erode(mask,kernel,iterations=2)
    mask = cv2.dilate(mask,kernel,iterations=7)
    return mask > 0.5,render_rgb
296
+
297
def alpha_inpaint_mask(render_alpha):
    """Return a boolean mask of pixels whose rounded alpha is 0.

    The mask is cleaned by one erosion and three dilations with a 5x5 kernel.
    """
    alpha = render_alpha.detach().squeeze().cpu().numpy()
    holes = 1.-np.around(alpha)
    # slightly clean mask
    kernel = np.ones((5,5),np.uint8)
    holes = cv2.erode(holes,kernel,iterations=1)
    holes = cv2.dilate(holes,kernel,iterations=3)
    return holes > 0.5
306
+
307
def edge_filter(metric_dpt,sky=None,times=0.1):
    """Flag depth-edge pixels.

    A pixel is an edge when the depth range of its neighbourhood (from
    ``nei_delta``) exceeds ``times`` x the 5th..95th percentile range of the
    non-sky depth.
    """
    if sky is None:
        sky = np.zeros_like(metric_dpt,bool)
    non_sky = metric_dpt[~sky]
    _range = np.percentile(non_sky,95) - np.percentile(non_sky, 5)
    nei_max,nei_min,_ = nei_delta(metric_dpt)
    delta = (nei_max-nei_min).numpy()
    return delta > times*_range
316
+
317
def fill_mask_with_nearest(imgs, mask):
    """Overwrite masked pixels with the value of the nearest unmasked pixel.

    Args:
        imgs: list of arrays indexed as ``img[row, col]``; modified in place.
        mask: 2-D array; values > 0.5 are masked, values < 0.5 are sources.

    Returns:
        ``imgs`` (the same list). Nothing is changed when the mask is empty
        or when no unmasked pixel exists.
    """
    mask = np.asarray(mask)
    # mask and un-mask pixel coors
    mask_coords = np.column_stack(np.where(mask > .5))
    non_mask_coords = np.column_stack(np.where(mask < .5))
    if len(mask_coords) == 0 or len(non_mask_coords) == 0:
        # nothing to fill, or nothing to fill from
        return imgs
    # kd-tree on un-masked pixels
    tree = cKDTree(non_mask_coords)
    # nn search of masked pixels
    _, idxs = tree.query(mask_coords)
    nearest = non_mask_coords[idxs]
    # replace and fill in one vectorised assignment per image; sources are
    # unmasked pixels, so they are never overwritten
    for img in imgs:
        img[mask_coords[:, 0], mask_coords[:, 1]] = img[nearest[:, 0], nearest[:, 1]]
    return imgs
331
+
332
def edge_rectify(metric_dpt,rgb,sky=None):
    """Replace depth-edge pixels of depth and rgb with their nearest non-edge values.

    ``rgb`` is deep-copied first; ``metric_dpt`` is passed to the fill step
    directly.
    """
    edge = edge_filter(metric_dpt,sky)
    rgb_copy = deepcopy(rgb)
    filled = fill_mask_with_nearest([metric_dpt,rgb_copy],edge)
    metric_dpt, rgb_copy = filled
    return metric_dpt,rgb_copy
337
+
338
+ from plyfile import PlyData, PlyElement
339
def color2feat(color):
    """Convert N*3 colors to degree-3 feature tensors.

    Only the first coefficient is filled, with ``(color - 0.5) / 0.28209479177387814``;
    all remaining coefficients are zero.

    Returns:
        ``(features_dc, features_rest)`` with shapes N*3*1 and N*3*15.
    """
    max_sh_degree = 3
    n_coeffs = (max_sh_degree + 1) ** 2
    fused_color = (color-0.5)/0.28209479177387814
    features = torch.zeros((fused_color.shape[0], 3, n_coeffs), dtype=torch.float32)
    features[:, :3, 0 ] = fused_color
    return features[:,:,0:1], features[:,:,1: ]
349
+
350
def construct_list_of_attributes(features_dc,features_rest,scale,rotation):
    """Return the ordered attribute names for the vertex record.

    Order: position and normal, ``f_dc_*``, ``f_rest_*``, ``opacity``,
    ``scale_*``, ``rot_*``. Counts are taken from the shapes of the
    arguments.
    """
    n_dc = features_dc.shape[1]*features_dc.shape[2]
    n_rest = features_rest.shape[1]*features_rest.shape[2]
    names = ['x', 'y', 'z', 'nx', 'ny', 'nz']
    names.extend('f_dc_{}'.format(i) for i in range(n_dc))
    names.extend('f_rest_{}'.format(i) for i in range(n_rest))
    names.append('opacity')
    names.extend('scale_{}'.format(i) for i in range(scale.shape[1]))
    names.extend('rot_{}'.format(i) for i in range(rotation.shape[1]))
    return names
363
+
364
def save_ply(scene,path):
    """Write every Gaussian frame of ``scene`` to a PLY file at ``path``.

    Attributes are concatenated over ``scene.gaussian_frames``; colors go
    through a sigmoid and ``color2feat``; normals are written as zeros.
    """
    frames = scene.gaussian_frames

    def _gather(attr, *shape):
        # concatenate one attribute over all frames
        return torch.cat([getattr(gf, attr).reshape(*shape) for gf in frames],dim=0)

    xyz = _gather('xyz', -1, 3).detach().cpu().numpy()
    scale = _gather('scale', -1, 3).detach().cpu().numpy()
    opacities = _gather('opacity', -1)[:,None].detach().cpu().numpy()
    rotation = _gather('rotation', -1, 4).detach().cpu().numpy()
    rgb = torch.sigmoid(_gather('rgb', -1, 3))
    # rgb
    features_dc, features_rest = color2feat(rgb)
    f_dc = features_dc.flatten(start_dim=1).detach().cpu().numpy()
    f_rest = features_rest.flatten(start_dim=1).detach().cpu().numpy()
    normals = np.zeros_like(xyz)
    # save
    attribute_names = construct_list_of_attributes(features_dc,features_rest,scale,rotation)
    dtype_full = [(attribute, 'f4') for attribute in attribute_names]
    elements = np.empty(xyz.shape[0], dtype=dtype_full)
    attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
    elements[:] = list(map(tuple, attributes))
    vertex = PlyElement.describe(elements, 'vertex')
    PlyData([vertex]).write(path)
ops/visual_check.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import imageio
2
+ import matplotlib
3
+ from ops.utils import *
4
+ from ops.gs.basic import *
5
+ from ops.trajs import _generate_trajectory
6
+
7
class Check():
    """Visual debugging helpers for a Gaussian scene."""

    def __init__(self) -> None:
        pass

    def _visual_pcd(self,scene:Gaussian_Scene):
        """Show all Gaussians with opacity above 1e-5 as a colored point cloud."""
        xyzs,rgbs = [],[]
        for gf in scene.gaussian_frames:
            xyz = gf.xyz.detach().cpu().numpy()
            rgb = torch.sigmoid(gf.rgb).detach().cpu().numpy()
            opacity = gf.opacity.detach().squeeze().cpu().numpy() > 1e-5
            xyzs.append(xyz[opacity])
            rgbs.append(rgb[opacity])
        xyzs = np.concatenate(xyzs,axis=0)
        rgbs = np.concatenate(rgbs,axis=0)
        visual_pcd(xyzs,color=rgbs,normal=True)

    @torch.no_grad()
    def _render_video(self,scene:Gaussian_Scene,save_dir='./'):
        """Render RGB and depth videos along a generated trajectory.

        Writes ``video_rgb.mp4`` and ``video_dpt.mp4`` into ``save_dir``
        at 20 fps. Frames are rendered at the first frame's resolution,
        downscaled so the short edge is at most 512 pixels.
        """
        import os  # local import: used only to build the output paths
        # render 25 video frames per scene frame
        nframes = len(scene.frames)*25
        video_trajs = _generate_trajectory(None,scene,nframes=nframes)
        H,W,intrinsic = scene.frames[0].H,scene.frames[0].W,deepcopy(scene.frames[0].intrinsic)
        # cap the short edge at 512 and rescale the intrinsics to match
        short_edge = min(H, W)
        if short_edge>512:
            ratio = 512/short_edge
            W,H = int(W*ratio),int(H*ratio)
            intrinsic[0:2] = intrinsic[0:2]*ratio
        # render
        rgbs,dpts = [],[]
        print(f'[INFO] rendering final video with {nframes} frames...')
        for pose in video_trajs:
            frame = Frame(H=H,W=W,
                          intrinsic=intrinsic,
                          extrinsic=pose)
            rgb,dpt,alpha = scene._render_RGBD(frame)
            rgb = rgb.detach().float().cpu().numpy()
            dpt = dpt.detach().float().cpu().numpy()
            dpts.append(dpt)
            rgbs.append((rgb * 255).astype(np.uint8))
        rgbs = np.stack(rgbs, axis=0)
        dpts = np.stack(dpts, axis=0)
        # normalise depth by the 1st..99th percentile of valid (>0) values
        valid_dpts = dpts[dpts>0.]
        if valid_dpts.size > 0:
            _min = np.percentile(valid_dpts, 1)
            _max = np.percentile(valid_dpts,99)
        else:
            # nothing rendered: avoid percentile of an empty array
            _min, _max = 0., 1.
        # guard against a zero range (constant depth)
        dpts = (dpts-_min) / max(_max-_min, 1e-8)
        dpts = dpts.clip(0,1)

        cm = matplotlib.colormaps["plasma"]
        dpts_color = cm(dpts,bytes=False)[...,0:3]
        dpts_color = (dpts_color*255).astype(np.uint8)

        # os.path.join also works when save_dir has no trailing slash
        imageio.mimwrite(os.path.join(save_dir,'video_rgb.mp4'),rgbs,fps=20)
        imageio.mimwrite(os.path.join(save_dir,'video_dpt.mp4'),dpts_color,fps=20)
pipe/__init__.py ADDED
File without changes
pipe/c2f_recons.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ render using frames in GS
3
+ inpaint with fooocus
4
+ '''
5
+ import os
6
+ import torch
7
+ import numpy as np
8
+ from PIL import Image
9
+ from copy import deepcopy
10
+ from ops.utils import *
11
+ from ops.sky import Sky_Seg_Tool
12
+ from ops.visual_check import Check
13
+ from ops.gs.train import GS_Train_Tool
14
+ from pipe.lvm_inpaint import Inpaint_Tool
15
+ from pipe.reconstruct import Reconstruct_Tool
16
+ from ops.trajs import _generate_trajectory
17
+ from ops.connect import Occlusion_Removal
18
+ from ops.gs.basic import Frame,Gaussian_Scene
19
+
20
+ from ops.mcs import HackSD_MCS
21
+ from pipe.refine_mvdps import Refinement_Tool_MCS
22
+
23
+ class Pipeline():
24
def __init__(self,cfg) -> None:
    """Store the configuration and construct the tools used by the pipeline.

    Args:
        cfg: configuration object; ``cfg.model.sky.value`` is read here and
            ``cfg`` is passed on to the tool constructors.
    """
    self.cfg = cfg
    self.device = 'cuda'
    # depth value assigned to sky pixels
    self.sky_value = cfg.model.sky.value
    # tools that take the configuration
    self.sky_segor = Sky_Seg_Tool(cfg)
    self.rgb_inpaintor = Inpaint_Tool(cfg)
    self.reconstructor = Reconstruct_Tool(cfg)
    # temp
    self.removalor = Occlusion_Removal()
    self.checkor = Check()
34
+
35
+ def _mkdir(self,dir):
36
+ if not os.path.exists(dir):
37
+ os.makedirs(dir)
38
+
39
def _resize_input(self,fn):
    """Resize the image at ``fn`` in place so its long edge matches the config.

    The image as loaded is first saved next to it as
    ``<name>.original<ext>``; then the resized RGB image overwrites ``fn``.
    The target long edge is ``cfg.scene.input.resize_long_edge``.
    """
    resize_long_edge = int(self.cfg.scene.input.resize_long_edge)
    print(f'[Preprocess...] Resize the long edge of input image to {resize_long_edge}.')
    # splitext only splits on a dot in the file name itself, so paths
    # without an extension or with dots in a directory name stay intact
    stem, ext = os.path.splitext(fn)
    backup_fn = stem + '.original' + ext
    # context manager so the file handle is closed before fn is overwritten
    with Image.open(fn) as src:
        src.save(backup_fn) # back up original image
        # convert('RGB') drops alpha and expands grayscale/palette images
        rgb = np.array(src.convert('RGB'))/255.
    H,W = rgb.shape[0:2]
    if H>W:
        W = int(W*resize_long_edge/H)
        H = resize_long_edge
    else:
        H = int(H*resize_long_edge/W)
        W = resize_long_edge
    rgb = cv2.resize(rgb,(W,H))
    pic = (rgb * 255.0).clip(0, 255)
    pic_save = Image.fromarray(pic.astype(np.uint8))
    pic_save.save(fn)
58
+
59
def _initialization(self,rgb):
    """Initialise the scene from one input image.

    The image is outpainted, depth and intrinsics are estimated on the
    outpainted result, and two frames (the input view and the outpainted
    view) are added to ``self.scene`` as trainable frames, followed by 100
    training iterations.
    """
    rgb = np.array(rgb)[:,:,:3]
    # conduct outpainting on rgb and change cu,cv
    outpaint_frame :Frame = self.rgb_inpaintor(
        Frame(rgb=rgb),
        outpaint_selections=self.outpaint_selections,
        outpaint_extend_times=self.outpaint_extend_times)
    # conduct reconstruction on outpaint results
    # NOTE(review): the intrinsic from this first call is overwritten by the
    # next call before it is used - confirm whether that is intended.
    _,intrinsic,_ = self.reconstructor._ProDpt_(rgb) # estimate focal on input view
    metric_dpt,intrinsic,edge_msk = self.reconstructor._ProDpt_(outpaint_frame.rgb)
    outpaint_frame.intrinsic = deepcopy(intrinsic)

    # split to input and outpaint areas
    in_h, in_w = rgb.shape[0], rgb.shape[1]
    input_frame = Frame(H=in_h,
                        W=in_w,
                        rgb=rgb,
                        intrinsic=deepcopy(intrinsic),
                        extrinsic=np.eye(4))
    # principal point at the centre of the input view
    input_frame.intrinsic[0,-1] = input_frame.W/2.
    input_frame.intrinsic[1,-1] = input_frame.H/2.

    # crop edge mask and depth back to the input area (the un-outpainted pixels)
    input_area = ~outpaint_frame.inpaint
    input_edg = edge_msk[input_area].reshape(input_frame.H,input_frame.W)
    input_dpt = metric_dpt[input_area].reshape(input_frame.H,input_frame.W)
    input_sky = self.sky_segor(input_frame.rgb)
    input_frame.sky = input_sky
    input_dpt[input_sky] = self.sky_value
    input_frame.dpt = input_dpt
    input_frame.inpaint = np.ones_like(input_edg,bool) & (~input_sky)
    input_frame.inpaint_wo_edge = (~input_edg) & (~input_sky)
    input_frame.ideal_dpt = deepcopy(input_dpt)
    input_frame.prompt = outpaint_frame.prompt

    # outpaint frame
    outpaint_sky = self.sky_segor(outpaint_frame.rgb)
    outpaint_frame.sky = outpaint_sky
    metric_dpt[outpaint_sky] = self.sky_value
    outpaint_frame.dpt = metric_dpt
    outpaint_frame.ideal_dpt = deepcopy(metric_dpt)
    outpaint_frame.inpaint = (outpaint_frame.inpaint)&(~outpaint_sky)
    outpaint_frame.inpaint_wo_edge = (outpaint_frame.inpaint)&(~edge_msk)

    # add init frame
    self.scene._add_trainable_frame(input_frame,require_grad=True)
    self.scene._add_trainable_frame(outpaint_frame,require_grad=True)
    trainer = GS_Train_Tool(self.scene,iters=100)
    self.scene = trainer(self.scene.frames)
101
+
102
def _generate_traj(self):
    """Generate the camera trajectory for the current scene and keep it in ``self.dense_trajs``."""
    trajectory = _generate_trajectory(self.cfg, self.scene)
    self.dense_trajs = trajectory
104
+
105
def _pose_to_frame(self, extrinsic, margin=32):
    """Render the current scene from ``extrinsic`` into a new frame.

    The frame is ``margin`` pixels larger than the first scene frame in both
    dimensions, reuses that frame's intrinsic with the principal point moved
    to the new image centre, and takes its prompt from the last scene frame.

    Returns:
        The frame returned by ``self.scene._render_for_inpaint``.
    """
    first_frame = self.scene.frames[0]
    height = first_frame.H + margin
    width = first_frame.W + margin

    cam = deepcopy(first_frame.intrinsic)
    cam[0, -1] = width / 2
    cam[1, -1] = height / 2

    candidate = Frame(
        H=height,
        W=width,
        intrinsic=cam,
        extrinsic=extrinsic,
        prompt=self.scene.frames[-1].prompt,
    )
    return self.scene._render_for_inpaint(candidate)
114
+
115
+ def _next_frame(self,margin=32):
116
+ # select the frame with largest holes but less than 60%
117
+ inpaint_area_ratio = []
118
+ for pose in self.dense_trajs:
119
+ temp_frame = self._pose_to_frame(pose,margin)
120
+ inpaint_mask = temp_frame.inpaint
121
+ inpaint_area_ratio.append(np.mean(inpaint_mask))
122
+ inpaint_area_ratio = np.array(inpaint_area_ratio)
123
+ inpaint_area_ratio[inpaint_area_ratio > 0.6] = 0.
124
+ # remove adjustancy frames
125
+ for s in self.select_frames:
126
+ inpaint_area_ratio[s] = 0.
127
+ if s-1>-1:
128
+ inpaint_area_ratio[s-1] = 0.
129
+ if s+1<len(self.dense_trajs):
130
+ inpaint_area_ratio[s+1] = 0.
131
+ # select the largest ones
132
+ select = np.argmax(inpaint_area_ratio)
133
+ if inpaint_area_ratio[select] < 0.0001: return None
134
+ self.select_frames.append(select)
135
+ pose = self.dense_trajs[select]
136
+ frame = self._pose_to_frame(pose,margin)
137
+ return frame
138
+
139
+ def _inpaint_next_frame(self,margin=32):
140
+ frame = self._next_frame(margin)
141
+ if frame is None: return None
142
+ # inpaint rgb
143
+ frame = self.rgb_inpaintor(frame)
144
+ # inpaint dpt
145
+ connect_dpt,metric_dpt,_,edge_msk = self.reconstructor._Guide_ProDpt_(frame.rgb,frame.intrinsic,frame.dpt,~frame.inpaint)
146
+ frame.dpt = connect_dpt
147
+ frame = self.removalor(self.scene,frame)
148
+ sky = self.sky_segor(frame.rgb)
149
+ frame.sky = sky
150
+ frame.dpt[sky] = self.sky_value
151
+ frame.inpaint = (frame.inpaint) & (~sky)
152
+ frame.inpaint_wo_edge = (frame.inpaint) & (~edge_msk)
153
+ # determine target depth and normal
154
+ frame.ideal_dpt = metric_dpt
155
+ self.scene._add_trainable_frame(frame)
156
+ return 0
157
+
158
+ def _coarse_scene(self,rgb):
159
+ self._initialization(rgb)
160
+ self._generate_traj()
161
+ self.select_frames = []
162
+ for i in range(self.n_sample-2):
163
+ print(f'Procecssing {i+2}/{self.n_sample} frame...')
164
+ sign = self._inpaint_next_frame()
165
+ if sign is None: break
166
+ self.scene = GS_Train_Tool(self.scene,iters=self.opt_iters_per_frame)(self.scene.frames)
167
+
168
def _MCS_Refinement(self):
    """Refine the coarse scene with the MCS refinement tool.

    A ``HackSD_MCS`` refiner is created on CUDA, handed to
    ``Refinement_Tool_MCS`` together with the MCS settings stored on
    ``self``, and the refined scene replaces ``self.scene``. The refiner is
    moved to the CPU afterwards.
    """
    ckpts = self.cfg.model.optimize
    refiner = HackSD_MCS(
        device='cuda',
        use_lcm=True,
        denoise_steps=self.mcs_iterations,
        sd_ckpt=ckpts.sd,
        lcm_ckpt=ckpts.lcm,
    )
    self.MVDPS = Refinement_Tool_MCS(
        self.scene,
        device='cuda',
        refiner=refiner,
        traj_type=self.traj_type,
        n_view=self.mcs_n_view,
        rect_w=self.mcs_rect_w,
        n_gsopt_iters=self.mcs_gsopt_per_frame,
    )
    refined_scene = self.MVDPS()
    self.scene = refined_scene
    refiner.to('cpu')
180
+
181
def __call__(self):
    """Run the whole pipeline for the image configured in ``cfg.scene.input.rgb``.

    Reads the run settings from ``self.cfg``, builds the coarse scene,
    refines it, and writes ``scene.pth`` plus the rendered video into the
    directory that contains the input image.
    """
    import os

    scene_cfg = self.cfg.scene
    rgb_fn = scene_cfg.input.rgb
    # coarse
    self.scene = Gaussian_Scene(self.cfg)
    # for trajectory generation
    self.n_sample = scene_cfg.traj.n_sample
    self.traj_type = scene_cfg.traj.traj_type
    self.scene.traj_type = scene_cfg.traj.traj_type
    # for scene generation
    self.opt_iters_per_frame = scene_cfg.gaussian.opt_iters_per_frame
    self.outpaint_selections = scene_cfg.outpaint.outpaint_selections
    self.outpaint_extend_times = scene_cfg.outpaint.outpaint_extend_times
    # for scene refinement
    self.mcs_n_view = scene_cfg.mcs.n_view
    self.mcs_rect_w = scene_cfg.mcs.rect_w
    self.mcs_iterations = scene_cfg.mcs.steps
    self.mcs_gsopt_per_frame = scene_cfg.mcs.gsopt_iters
    # coarse scene
    self._resize_input(rgb_fn)
    # Outputs go next to the input image. A bare file name (no directory
    # part) maps to the current directory instead of a truncated file name.
    save_dir = os.path.dirname(rgb_fn) or '.'
    # the context manager releases the file handle once the image is consumed
    with Image.open(rgb_fn) as rgb:
        self._coarse_scene(rgb)
    torch.cuda.empty_cache()
    # refinement
    self._MCS_Refinement()
    torch.save(self.scene, f'{save_dir}/scene.pth')
    self.checkor._render_video(self.scene, save_dir=f'{save_dir}/')
208
+
209
+
210
+
211
+
pipe/cfgs/INSTRUCT.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## INSTRUCTION
2
+
3
+ Here, we provide an explanation of some key parameters in ```pipe/cfgs/basic.yaml``` to facilitate parameter adjustments. For more details, please refer to our paper.
4
+
5
+ - ```scene.outpaint.outpaint_extend_times```
6
+ - This parameter controls the outpaint ratio of the image when constructing the global scaffold. A larger value will result in smoother scene boundaries, but it may also introduce distortion. A recommended range is between 0.3 and 0.6.
7
+
8
+ - ```scene.traj```
9
+ - ```.n_sample```: This parameter controls the number of warp-and-inpaint iterations. The more iterations, the higher the scene integrity (fewer holes). In most cases, a value of 10 is sufficient.
10
+ - ```.far_percentage``` / ```.traj_forward_ratio``` / ```.traj_backward_ratio```
11
+ - These parameters control the range of the camera's spiral trajectory (and thus the extent of the final scene) in ```ops/trajs```. Directly reconstructing a very large scene might cause distortions.
12
+ - ```far_percentage``` controls the scale of the trajectory range. For large-scale scenes (especially those involving the sky or large windows), we recommend reducing this value. An example is in [this issue](https://github.com/WHU-USI3DV/VistaDream/issues/3).
13
+ - ```traj_forward_ratio``` and ```traj_backward_ratio``` control the forward and backward range of the camera, respectively.
14
+
15
+ - ```scene.mcs```
16
+ - ```.steps``` sets the number of MCS refinement steps. We suggest a value between 8 and 15.
17
+ - ```.n_view``` sets the number of viewpoints optimized simultaneously in MCS. On an RTX 4090 (24GB), 8 is feasible.
18
+ - ```.rect_w``` determines the MCS control strength. We suggest 0.3-0.8.
pipe/cfgs/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from omegaconf import OmegaConf
2
+
3
def load_cfg(cfg_path):
    """Load the configuration stored at ``cfg_path`` with ``OmegaConf.load`` and return it."""
    cfg = OmegaConf.load(cfg_path)
    return cfg
5
+
6
def merge_cfgs(cfg1, cfg2):
    """Return the result of ``OmegaConf.merge(cfg1, cfg2)``."""
    return OmegaConf.merge(cfg1, cfg2)
pipe/cfgs/basic.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: basic
2
+
3
+ model:
4
+ sky:
5
+ value: 1e5 # to update
6
+ oneformer:
7
+ ckpt: 'tools/OneFormer/checkpoints/coco_pretrain_1280x1280_150_16_dinat_l_oneformer_ade20k_160k.pth'
8
+ yaml: 'tools/OneFormer/configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml'
9
+ vlm:
10
+ llava:
11
+ ckpt: 'llava-hf/bakLlava-v1-hf' # downloaded from hugging face
12
+ mde:
13
+ dpt_pro:
14
+ ckpt: 'tools/DepthPro/checkpoints/depth_pro.pt'
15
+ paint:
16
+ fooocus:
17
+ pass # it will load required checkpoints automatically
18
+ optimize:
19
+ sd: 'sd-legacy/stable-diffusion-v1-5' # downloaded from hugging face
20
+ lcm: 'latent-consistency/lcm-lora-sdv1-5'
21
+
22
+ scene:
23
+ input:
24
+ rgb: 'data/sd_readingroom/color.png'
25
+ resize_long_edge: 512
26
+
27
+ outpaint:
28
+ outpaint_selections: ['Left','Right','Top','Bottom']
29
+ outpaint_extend_times: 0.45
30
+
31
+ traj:
32
+ n_sample: 10
33
+ traj_type: 'spiral'
34
+ near_percentage: 5
35
+ far_percentage: 95
36
+ traj_forward_ratio: 0.3
37
+ traj_backward_ratio: 0.7
38
+
39
+ gaussian:
40
+ opt_iters_per_frame: 512
41
+
42
+ mcs:
43
+ steps: 10 # among 50 total steps
44
+ n_view: 8
45
+ rect_w: 0.7
46
+ gsopt_iters: 256
47
+
pipe/lvm_inpaint.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ render using frames in GS
3
+ inpaint with fooocus
4
+ '''
5
+ import torch
6
+ import numpy as np
7
+ from ops.llava import Llava
8
+ from ops.gs.basic import Frame
9
+ # from ops.fooocus import Fooocus
10
+
11
class Fooocus:
    """Empty stand-in defined locally while the ``ops.fooocus`` import is commented out."""

    def __init__(self):
        # intentionally does nothing
        return None
13
+
14
class Inpaint_Tool():
    """Caption a frame with LLaVA and inpaint / outpaint it with Fooocus."""

    # Text that ends the LLaVA query; the caption is whatever follows its
    # last occurrence in the model answer.
    _CAPTION_MARKER = 'ASSISTANT: This image is taken from a scene of '

    def __init__(self, cfg) -> None:
        self.cfg = cfg
        self._load_model()

    def _load_model(self):
        """Create the Fooocus wrapper and the LLaVA captioner (LLaVA starts on the CPU)."""
        self.fooocus = Fooocus()
        self.llava = Llava(device='cpu', llava_ckpt=self.cfg.model.vlm.llava.ckpt)

    def _llava_prompt(self, frame):
        """Return the fixed LLaVA query; ``frame`` is currently not used."""
        # NOTE(review): built from adjacent literals so the query text does
        # not depend on how the source lines are indented.
        prompt = ('<image>\n '
                  'USER: Detaily imagine and describe the scene this image taken from? '
                  '\n ASSISTANT: This image is taken from a scene of ')
        return prompt

    @staticmethod
    def _extract_caption(answer, marker=_CAPTION_MARKER):
        """Return the text after the last ``marker`` in ``answer``.

        When the marker is absent the whole answer is returned, instead of
        an arbitrarily truncated string.
        """
        return answer.rpartition(marker)[2]

    def __call__(self, frame: 'Frame', outpaint_selections=None, outpaint_extend_times=0.0):
        '''
        Inpaint (default) or outpaint ``frame`` and return the updated frame.

        Args:
            frame: must be a Frame. ``frame.rgb`` is the image to process;
                ``frame.inpaint`` is the mask used for inpainting.
                ``frame.prompt`` is reused when set, otherwise it is
                generated with LLaVA and stored on the frame.
            outpaint_selections: directions to outpaint. Empty / ``None``
                means inpainting; otherwise exactly four entries are
                required, because the original image is assumed to sit in
                the centre of the enlarged result.
            outpaint_extend_times: extension ratio forwarded to Fooocus.

        Raises:
            ValueError: if ``outpaint_selections`` is non-empty but does not
                contain exactly four entries.
        '''
        if outpaint_selections is None:
            outpaint_selections = []
        outpainting = len(outpaint_selections) > 0
        if outpainting and len(outpaint_selections) != 4:
            # checked up front so a bad request fails before the costly models run
            raise ValueError('outpaint_selections must contain exactly 4 directions, '
                             f'got {len(outpaint_selections)}')

        # ----------------------- LLaVA -----------------------
        if frame.prompt is None:
            print('Inpaint-Caption[1/3] Move llava.model to GPU...')
            self.llava.model.to('cuda')
            print('Inpaint-Caption[2/3] Llava inpainting instruction:')
            query = self._llava_prompt(frame)
            prompt = self._extract_caption(self.llava(frame.rgb, query))
            print(prompt)
            print('Inpaint-Caption[3/3] Move llava.model to CPU...')
            self.llava.model.to('cpu')
            torch.cuda.empty_cache()
            frame.prompt = prompt
        else:
            prompt = frame.prompt
            print(f'Using pre-generated prompt: {prompt}')

        # --------------------- Fooocus ----------------------
        print('Inpaint-Fooocus[1/2] Fooocus inpainting...')
        image = frame.rgb
        # outpainting passes an all-False mask; inpainting uses the frame's hole mask
        mask = np.zeros_like(image, bool) if outpainting else frame.inpaint
        fooocus_result = self.fooocus(image_number=1,
                                      prompt=prompt + ' 8K, no large circles, no cameras, no fisheye.',
                                      negative_prompt='Any fisheye, any large circles, any blur, unrealism.',
                                      outpaint_selections=outpaint_selections,
                                      outpaint_extend_times=outpaint_extend_times,
                                      origin_image=image,
                                      mask_image=mask,)[0]
        torch.cuda.empty_cache()

        # reset the frame for outpainting
        if outpainting:
            small_H, small_W = frame.rgb.shape[0:2]
            large_H, large_W = fooocus_result.shape[0:2]
            if frame.intrinsic is not None:
                # focal length is left unchanged; only the principal point moves
                frame.intrinsic[0, -1] = large_W // 2
                frame.intrinsic[1, -1] = large_H // 2
            frame.H = large_H
            frame.W = large_W
            # top-left corner of the original image inside the enlarged one
            begin_H = (large_H - small_H) // 2
            begin_W = (large_W - small_W) // 2
            inpaint = np.ones_like(fooocus_result[..., 0])
            inpaint[begin_H:(begin_H + small_H), begin_W:(begin_W + small_W)] *= 0.
            # True everywhere except the original image area
            frame.inpaint = inpaint > 0.5
        frame.rgb = fooocus_result

        print('Inpaint-Fooocus[2/2] Assign Frame...')
        return frame
85
+
pipe/reconstruct.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Dust3R reconstruction
3
+ GeoWizard Estimation
4
+ Smooth Projection
5
+ '''
6
+ import torch
7
+ import PIL,cv2
8
+ import numpy as np
9
+ from PIL import Image
10
+ from ops.gs.basic import Frame
11
+ from ops.utils import *
12
+ from ops.depth_pro import Depth_Pro_Tool
13
+ from ops.connect import Smooth_Connect_Tool
14
+
15
+
16
class Reconstruct_Tool():
    """Metric depth estimation with Depth Pro, optionally connected to a reference depth."""

    def __init__(self, cfg) -> None:
        self.cfg = cfg
        self._load_model()
        self.connector = Smooth_Connect_Tool()

    def _load_model(self):
        """Create the Depth Pro wrapper on the CPU from the configured checkpoint."""
        self.pro_dpt = Depth_Pro_Tool(ckpt=self.cfg.model.mde.dpt_pro.ckpt, device='cpu')

    def _ProDpt_(self, rgb, intrinsic=None):
        """Estimate metric depth for ``rgb``.

        Args:
            rgb: image handed to the depth model.
            intrinsic: optional camera matrix; when given, ``intrinsic[0,0]``
                is passed to the model as the focal length in pixels.

        Returns:
            ``(metric_dpt, intrinsic, edge_mask)`` where ``intrinsic`` is the
            one returned by the depth model and ``edge_mask`` comes from
            ``edge_filter(metric_dpt, times=0.05)``.
        """
        print('Pro_dpt[1/3] Move Pro_dpt.model to GPU...')
        self.pro_dpt.to('cuda')
        try:
            print('Pro_dpt[2/3] Pro_dpt Estimation...')
            f_px = intrinsic[0, 0] if intrinsic is not None else None
            metric_dpt, intrinsic = self.pro_dpt(rgb, f_px)
        finally:
            # always give the GPU back, even when the estimation fails
            print('Pro_dpt[3/3] Move Pro_dpt.model to CPU...')
            self.pro_dpt.to('cpu')
            torch.cuda.empty_cache()
        edge_mask = edge_filter(metric_dpt, times=0.05)
        return metric_dpt, intrinsic, edge_mask

    def _Guide_ProDpt_(self, rgb, intrinsic=None, refer_dpt=None, refer_msk=None):
        """Estimate metric depth for ``rgb`` and connect it to a reference depth.

        Args:
            rgb: image handed to the depth model.
            intrinsic: optional camera matrix; ``intrinsic[0,0]`` is used as
                the focal length in pixels when given.
            refer_dpt: reference depth passed to the connector.
            refer_msk: mask belonging to ``refer_dpt``; its inverse is passed
                to the connector, so it must support ``~``.

        Returns:
            ``(metric_dpt_connect, metric_dpt, intrinsic, edge_mask)``; the
            edge mask is computed on the connected depth.
        """
        print('Pro_dpt[1/3] Move Pro_dpt.model to GPU...')
        self.pro_dpt.to('cuda')
        try:
            print('Pro_dpt[2/3] Pro_dpt Estimation...')
            f_px = intrinsic[0, 0] if intrinsic is not None else None
            metric_dpt, intrinsic = self.pro_dpt(rgb, f_px=f_px)
            metric_dpt_connect = self.connector._affine_dpt_to_GS(refer_dpt, metric_dpt, ~refer_msk)
        finally:
            # always give the GPU back, even when estimation or connection fails
            print('Pro_dpt[3/3] Move Pro_dpt.model to CPU...')
            self.pro_dpt.to('cpu')
            torch.cuda.empty_cache()
        edge_mask = edge_filter(metric_dpt_connect, times=0.05)
        return metric_dpt_connect, metric_dpt, intrinsic, edge_mask
51
+
52
+ # ------------- TODO: Metricv2 + Guide-GeoWizard ------------------ #
pipe/refine_mvdps.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Coarse Gaussian Rendering -- RGB-D as init
3
+ RGB-D add noise (MV init)
4
+ Cycling:
5
+ denoise to x0 and d0 -- optimize Gaussian
6
+ re-rendering RGB-D
7
+ render RGB-D to rectified noise
8
+ noise rectification
9
+ step denoise with rectified noise
10
+ -- Finally the Gaussian
11
+ '''
12
+ import torch
13
+ import numpy as np
14
+ from copy import deepcopy
15
+ from ops.utils import *
16
+ from ops.gs.train import *
17
+ from ops.trajs import _generate_trajectory
18
+ from ops.gs.basic import Frame,Gaussian_Scene
19
+
20
class Refinement_Tool_MCS():
    """Refine a coarse Gaussian scene by alternating diffusion denoising and Gaussian optimisation."""

    def __init__(self,
                 coarse_GS: 'Gaussian_Scene',
                 device='cuda',
                 refiner=None,
                 traj_type='spiral',
                 n_view=8,
                 rect_w=0.7,
                 n_gsopt_iters=256) -> None:
        """
        Args:
            coarse_GS: coarse scene to refine; its last frame provides the prompt.
            device: device the refiner and the supervision tensors are moved to.
            refiner: diffusion refiner; must provide ``to``, ``denoise_steps``,
                ``timesteps``, ``model._encode_text_prompt`` and the
                ``_encode_mv_init_images`` / ``_denoise_to_x0`` / ``_step_denoise``
                methods used below.
            traj_type: stored on the instance.
            n_view: number of viewpoints refined together.
            rect_w: weight forwarded to the refiner's ``_step_denoise``.
            n_gsopt_iters: Gaussian optimisation iterations per denoising step.
        """
        # input coarse GS
        # here the rendered frames are refined rather than the gaussian parameters directly
        self.n_view = n_view
        self.rect_w = rect_w
        self.n_gsopt_iters = n_gsopt_iters
        self.coarse_GS = coarse_GS
        self.refine_frames: list[Frame] = []
        # hyperparameters
        self.process_res = 512
        self.device = device
        self.traj_type = traj_type
        # models -- placed on the requested device
        self.RGB_LCM = refiner
        self.RGB_LCM.to(self.device)
        self.steps = self.RGB_LCM.denoise_steps
        # prompt for diffusion
        prompt = self.coarse_GS.frames[-1].prompt
        self.rgb_prompt_latent = self.RGB_LCM.model._encode_text_prompt(prompt)
        # loss function
        self.rgb_lossfunc = RGB_Loss(w_ssim=0.2)

    def _pre_process(self):
        """Create the frames to refine: same intrinsic and size, poses from the trajectory."""
        # determine the diffusion target shape
        strict_times = 32
        origin_H = self.coarse_GS.frames[0].H
        origin_W = self.coarse_GS.frames[0].W
        self.target_H, self.target_W = self.process_res, self.process_res
        # rescale the first frame's intrinsic to the target resolution
        intrinsic = deepcopy(self.coarse_GS.frames[0].intrinsic)
        H_ratio, W_ratio = self.target_H / origin_H, self.target_W / origin_W
        intrinsic[0] *= W_ratio
        intrinsic[1] *= H_ratio
        # frames are rendered with a border of strict_times pixels on every side
        target_H, target_W = self.target_H + 2 * strict_times, self.target_W + 2 * strict_times
        intrinsic[0, -1] = target_W / 2
        intrinsic[1, -1] = target_H / 2
        # generate a set of cameras; the first and last trajectory poses are dropped
        trajs = _generate_trajectory(None, self.coarse_GS, nframes=self.n_view + 2)[1:-1]
        for pose in trajs:
            fine_frame = Frame()
            fine_frame.H = target_H
            fine_frame.W = target_W
            fine_frame.extrinsic = pose
            fine_frame.intrinsic = deepcopy(intrinsic)
            fine_frame.prompt = self.coarse_GS.frames[-1].prompt
            self.refine_frames.append(fine_frame)
        # determine inpaint mask from a scene holding only the first two frames
        temp_scene = Gaussian_Scene()
        temp_scene._add_trainable_frame(self.coarse_GS.frames[0], require_grad=False)
        temp_scene._add_trainable_frame(self.coarse_GS.frames[1], require_grad=False)
        for frame in self.refine_frames:
            # NOTE(review): the returned frame is not stored back, so this
            # relies on _render_for_inpaint updating ``frame`` in place -- confirm.
            temp_scene._render_for_inpaint(frame)

    def _mv_init(self):
        """Render every refine frame from the coarse scene and hand the batch to the refiner."""
        rgbs = []
        for frame in self.refine_frames:
            # all frames are rendered in the same shape
            render_rgb, render_dpt, render_alpha = self.coarse_GS._render_RGBD(frame)
            # channel-first with a leading batch axis
            rgbs.append(render_rgb.permute(2, 0, 1)[None])
        self.rgbs = torch.cat(rgbs, dim=0)
        self.RGB_LCM._encode_mv_init_images(self.rgbs)

    def _to_cuda(self, tensor):
        """Convert a numpy array to a float32 torch tensor on ``self.device``."""
        tensor = torch.from_numpy(tensor.astype(np.float32)).to(self.device)
        return tensor

    def _x0_rectification(self, denoise_rgb, iters):
        """Optimise a copy of the coarse scene towards the denoised images.

        Args:
            denoise_rgb: denoised images, indexed like ``self.refine_frames``.
            iters: number of optimisation iterations.
        """
        # gaussian initialization: always restart from the coarse scene
        CGS = deepcopy(self.coarse_GS)
        for gf in CGS.gaussian_frames:
            gf._require_grad(True)
        self.refine_GS = GS_Train_Tool(CGS)
        # rectification
        for _ in range(iters):
            loss = 0.
            # supervise on the first two coarse frames, weighted by the
            # number of refine frames
            for i in range(2):
                keep_frame: Frame = self.coarse_GS.frames[i]
                render_rgb, render_dpt, render_alpha = self.refine_GS._render(keep_frame)
                loss_rgb = self.rgb_lossfunc(render_rgb, self._to_cuda(keep_frame.rgb), valid_mask=keep_frame.inpaint)
                loss += loss_rgb * len(self.refine_frames)
            # then multiview supervision
            for i, frame in enumerate(self.refine_frames):
                render_rgb, render_dpt, render_alpha = self.refine_GS._render(frame)
                loss_rgb_item = self.rgb_lossfunc(denoise_rgb[i], render_rgb)
                loss += loss_rgb_item
            # optimization
            loss.backward()
            self.refine_GS.optimizer.step()
            self.refine_GS.optimizer.zero_grad()

    def _step_gaussian_optimization(self, step):
        """Denoise to x0 at the given step, then fit the Gaussians to the result.

        Returns:
            ``(rgb_t, rgb_noise_pr)``: the timestep tensor and the noise
            prediction returned by the refiner.
        """
        # denoise to x0
        with torch.no_grad():
            # only the last self.steps timesteps of the schedule are used
            rgb_t = self.RGB_LCM.timesteps[-self.steps + step]
            rgb_t = torch.tensor([rgb_t]).to(self.device)
            rgb_noise_pr, rgb_denoise = self.RGB_LCM._denoise_to_x0(rgb_t, self.rgb_prompt_latent)
            rgb_denoise = rgb_denoise.permute(0, 2, 3, 1)
        # rendering each frame and refinement
        self._x0_rectification(rgb_denoise, self.n_gsopt_iters)
        return rgb_t, rgb_noise_pr

    def _step_diffusion_rectification(self, rgb_t, rgb_noise_pr):
        """Re-render the refined scene and use it to rectify the refiner's denoising step."""
        # re-rendering RGB
        with torch.no_grad():
            x0_rect = []
            for frame in self.refine_frames:
                re_render_rgb, _, re_render_alpha = self.refine_GS._render(frame)
                x0_rect.append(re_render_rgb.permute(2, 0, 1)[None])
            x0_rect = torch.cat(x0_rect, dim=0)
        # rectification
        self.RGB_LCM._step_denoise(rgb_t, rgb_noise_pr, x0_rect, rect_w=self.rect_w)

    def __call__(self):
        """Run the full refinement and return the refined scene with gradients disabled."""
        # warmup
        self._pre_process()
        self._mv_init()
        for step in tqdm.tqdm(range(self.steps)):
            rgb_t, rgb_noise_pr = self._step_gaussian_optimization(step)
            self._step_diffusion_rectification(rgb_t, rgb_noise_pr)
        scene = self.refine_GS.GS
        for gf in scene.gaussian_frames:
            gf._require_grad(False)
        return scene
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch == 2.0.1
2
+ torchvision == 0.15.2
3
+ xformers == 0.0.21
4
+ numpy
5
+ regex
6
+ torchmetrics
7
+ accelerate
8
+ gsplat
9
+ open3d
10
+ tqdm
11
+ omegaconf
12
+ opencv-python
13
+ opencv-contrib-python
14
+ plyfile
15
+ timm
16
+ wandb
17
+ ftfy
18
+ pillow_heif
19
+ diffdist
20
+ diffusers
21
+ einops
22
+ imageio
23
+ imageio-ffmpeg
24
+ transformers
25
+ torchsde
26
+ huggingface-hub
27
+
28
+
tools/DepthPro/ACKNOWLEDGEMENTS.md ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Acknowledgements
2
+ Portions of this Software may utilize the following copyrighted
3
+ material, the use of which is hereby acknowledged.
4
+
5
+ ------------------------------------------------
6
+ PyTorch Image Models (timm)
7
+ Ross Wightman
8
+
9
+ Apache License
10
+ Version 2.0, January 2004
11
+ http://www.apache.org/licenses/
12
+
13
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
14
+
15
+ 1. Definitions.
16
+
17
+ "License" shall mean the terms and conditions for use, reproduction,
18
+ and distribution as defined by Sections 1 through 9 of this document.
19
+
20
+ "Licensor" shall mean the copyright owner or entity authorized by
21
+ the copyright owner that is granting the License.
22
+
23
+ "Legal Entity" shall mean the union of the acting entity and all
24
+ other entities that control, are controlled by, or are under common
25
+ control with that entity. For the purposes of this definition,
26
+ "control" means (i) the power, direct or indirect, to cause the
27
+ direction or management of such entity, whether by contract or
28
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
29
+ outstanding shares, or (iii) beneficial ownership of such entity.
30
+
31
+ "You" (or "Your") shall mean an individual or Legal Entity
32
+ exercising permissions granted by this License.
33
+
34
+ "Source" form shall mean the preferred form for making modifications,
35
+ including but not limited to software source code, documentation
36
+ source, and configuration files.
37
+
38
+ "Object" form shall mean any form resulting from mechanical
39
+ transformation or translation of a Source form, including but
40
+ not limited to compiled object code, generated documentation,
41
+ and conversions to other media types.
42
+
43
+ "Work" shall mean the work of authorship, whether in Source or
44
+ Object form, made available under the License, as indicated by a
45
+ copyright notice that is included in or attached to the work
46
+ (an example is provided in the Appendix below).
47
+
48
+ "Derivative Works" shall mean any work, whether in Source or Object
49
+ form, that is based on (or derived from) the Work and for which the
50
+ editorial revisions, annotations, elaborations, or other modifications
51
+ represent, as a whole, an original work of authorship. For the purposes
52
+ of this License, Derivative Works shall not include works that remain
53
+ separable from, or merely link (or bind by name) to the interfaces of,
54
+ the Work and Derivative Works thereof.
55
+
56
+ "Contribution" shall mean any work of authorship, including
57
+ the original version of the Work and any modifications or additions
58
+ to that Work or Derivative Works thereof, that is intentionally
59
+ submitted to Licensor for inclusion in the Work by the copyright owner
60
+ or by an individual or Legal Entity authorized to submit on behalf of
61
+ the copyright owner. For the purposes of this definition, "submitted"
62
+ means any form of electronic, verbal, or written communication sent
63
+ to the Licensor or its representatives, including but not limited to
64
+ communication on electronic mailing lists, source code control systems,
65
+ and issue tracking systems that are managed by, or on behalf of, the
66
+ Licensor for the purpose of discussing and improving the Work, but
67
+ excluding communication that is conspicuously marked or otherwise
68
+ designated in writing by the copyright owner as "Not a Contribution."
69
+
70
+ "Contributor" shall mean Licensor and any individual or Legal Entity
71
+ on behalf of whom a Contribution has been received by Licensor and
72
+ subsequently incorporated within the Work.
73
+
74
+ 2. Grant of Copyright License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ copyright license to reproduce, prepare Derivative Works of,
78
+ publicly display, publicly perform, sublicense, and distribute the
79
+ Work and such Derivative Works in Source or Object form.
80
+
81
+ 3. Grant of Patent License. Subject to the terms and conditions of
82
+ this License, each Contributor hereby grants to You a perpetual,
83
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
84
+ (except as stated in this section) patent license to make, have made,
85
+ use, offer to sell, sell, import, and otherwise transfer the Work,
86
+ where such license applies only to those patent claims licensable
87
+ by such Contributor that are necessarily infringed by their
88
+ Contribution(s) alone or by combination of their Contribution(s)
89
+ with the Work to which such Contribution(s) was submitted. If You
90
+ institute patent litigation against any entity (including a
91
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
92
+ or a Contribution incorporated within the Work constitutes direct
93
+ or contributory patent infringement, then any patent licenses
94
+ granted to You under this License for that Work shall terminate
95
+ as of the date such litigation is filed.
96
+
97
+ 4. Redistribution. You may reproduce and distribute copies of the
98
+ Work or Derivative Works thereof in any medium, with or without
99
+ modifications, and in Source or Object form, provided that You
100
+ meet the following conditions:
101
+
102
+ (a) You must give any other recipients of the Work or
103
+ Derivative Works a copy of this License; and
104
+
105
+ (b) You must cause any modified files to carry prominent notices
106
+ stating that You changed the files; and
107
+
108
+ (c) You must retain, in the Source form of any Derivative Works
109
+ that You distribute, all copyright, patent, trademark, and
110
+ attribution notices from the Source form of the Work,
111
+ excluding those notices that do not pertain to any part of
112
+ the Derivative Works; and
113
+
114
+ (d) If the Work includes a "NOTICE" text file as part of its
115
+ distribution, then any Derivative Works that You distribute must
116
+ include a readable copy of the attribution notices contained
117
+ within such NOTICE file, excluding those notices that do not
118
+ pertain to any part of the Derivative Works, in at least one
119
+ of the following places: within a NOTICE text file distributed
120
+ as part of the Derivative Works; within the Source form or
121
+ documentation, if provided along with the Derivative Works; or,
122
+ within a display generated by the Derivative Works, if and
123
+ wherever such third-party notices normally appear. The contents
124
+ of the NOTICE file are for informational purposes only and
125
+ do not modify the License. You may add Your own attribution
126
+ notices within Derivative Works that You distribute, alongside
127
+ or as an addendum to the NOTICE text from the Work, provided
128
+ that such additional attribution notices cannot be construed
129
+ as modifying the License.
130
+
131
+ You may add Your own copyright statement to Your modifications and
132
+ may provide additional or different license terms and conditions
133
+ for use, reproduction, or distribution of Your modifications, or
134
+ for any such Derivative Works as a whole, provided Your use,
135
+ reproduction, and distribution of the Work otherwise complies with
136
+ the conditions stated in this License.
137
+
138
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
139
+ any Contribution intentionally submitted for inclusion in the Work
140
+ by You to the Licensor shall be under the terms and conditions of
141
+ this License, without any additional terms or conditions.
142
+ Notwithstanding the above, nothing herein shall supersede or modify
143
+ the terms of any separate license agreement you may have executed
144
+ with Licensor regarding such Contributions.
145
+
146
+ 6. Trademarks. This License does not grant permission to use the trade
147
+ names, trademarks, service marks, or product names of the Licensor,
148
+ except as required for reasonable and customary use in describing the
149
+ origin of the Work and reproducing the content of the NOTICE file.
150
+
151
+ 7. Disclaimer of Warranty. Unless required by applicable law or
152
+ agreed to in writing, Licensor provides the Work (and each
153
+ Contributor provides its Contributions) on an "AS IS" BASIS,
154
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
155
+ implied, including, without limitation, any warranties or conditions
156
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
157
+ PARTICULAR PURPOSE. You are solely responsible for determining the
158
+ appropriateness of using or redistributing the Work and assume any
159
+ risks associated with Your exercise of permissions under this License.
160
+
161
+ 8. Limitation of Liability. In no event and under no legal theory,
162
+ whether in tort (including negligence), contract, or otherwise,
163
+ unless required by applicable law (such as deliberate and grossly
164
+ negligent acts) or agreed to in writing, shall any Contributor be
165
+ liable to You for damages, including any direct, indirect, special,
166
+ incidental, or consequential damages of any character arising as a
167
+ result of this License or out of the use or inability to use the
168
+ Work (including but not limited to damages for loss of goodwill,
169
+ work stoppage, computer failure or malfunction, or any and all
170
+ other commercial damages or losses), even if such Contributor
171
+ has been advised of the possibility of such damages.
172
+
173
+ 9. Accepting Warranty or Additional Liability. While redistributing
174
+ the Work or Derivative Works thereof, You may choose to offer,
175
+ and charge a fee for, acceptance of support, warranty, indemnity,
176
+ or other liability obligations and/or rights consistent with this
177
+ License. However, in accepting such obligations, You may act only
178
+ on Your own behalf and on Your sole responsibility, not on behalf
179
+ of any other Contributor, and only if You agree to indemnify,
180
+ defend, and hold each Contributor harmless for any liability
181
+ incurred by, or claims asserted against, such Contributor by reason
182
+ of your accepting any such warranty or additional liability.
183
+
184
+ END OF TERMS AND CONDITIONS
185
+
186
+ APPENDIX: How to apply the Apache License to your work.
187
+
188
+ To apply the Apache License to your work, attach the following
189
+ boilerplate notice, with the fields enclosed by brackets "{}"
190
+ replaced with your own identifying information. (Don't include
191
+ the brackets!) The text should be enclosed in the appropriate
192
+ comment syntax for the file format. We also recommend that a
193
+ file or class name and description of purpose be included on the
194
+ same "printed page" as the copyright notice for easier
195
+ identification within third-party archives.
196
+
197
+ Copyright 2019 Ross Wightman
198
+
199
+ Licensed under the Apache License, Version 2.0 (the "License");
200
+ you may not use this file except in compliance with the License.
201
+ You may obtain a copy of the License at
202
+
203
+ http://www.apache.org/licenses/LICENSE-2.0
204
+
205
+ Unless required by applicable law or agreed to in writing, software
206
+ distributed under the License is distributed on an "AS IS" BASIS,
207
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
208
+ See the License for the specific language governing permissions and
209
+ limitations under the License.
210
+
211
+
212
+ ------------------------------------------------
213
+ DINOv2: Learning Robust Visual Features without Supervision
214
+ Github source: https://github.com/facebookresearch/dinov2
215
+
216
+
217
+
218
+ Apache License
219
+ Version 2.0, January 2004
220
+ http://www.apache.org/licenses/
221
+
222
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
223
+
224
+ 1. Definitions.
225
+
226
+ "License" shall mean the terms and conditions for use, reproduction,
227
+ and distribution as defined by Sections 1 through 9 of this document.
228
+
229
+ "Licensor" shall mean the copyright owner or entity authorized by
230
+ the copyright owner that is granting the License.
231
+
232
+ "Legal Entity" shall mean the union of the acting entity and all
233
+ other entities that control, are controlled by, or are under common
234
+ control with that entity. For the purposes of this definition,
235
+ "control" means (i) the power, direct or indirect, to cause the
236
+ direction or management of such entity, whether by contract or
237
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
238
+ outstanding shares, or (iii) beneficial ownership of such entity.
239
+
240
+ "You" (or "Your") shall mean an individual or Legal Entity
241
+ exercising permissions granted by this License.
242
+
243
+ "Source" form shall mean the preferred form for making modifications,
244
+ including but not limited to software source code, documentation
245
+ source, and configuration files.
246
+
247
+ "Object" form shall mean any form resulting from mechanical
248
+ transformation or translation of a Source form, including but
249
+ not limited to compiled object code, generated documentation,
250
+ and conversions to other media types.
251
+
252
+ "Work" shall mean the work of authorship, whether in Source or
253
+ Object form, made available under the License, as indicated by a
254
+ copyright notice that is included in or attached to the work
255
+ (an example is provided in the Appendix below).
256
+
257
+ "Derivative Works" shall mean any work, whether in Source or Object
258
+ form, that is based on (or derived from) the Work and for which the
259
+ editorial revisions, annotations, elaborations, or other modifications
260
+ represent, as a whole, an original work of authorship. For the purposes
261
+ of this License, Derivative Works shall not include works that remain
262
+ separable from, or merely link (or bind by name) to the interfaces of,
263
+ the Work and Derivative Works thereof.
264
+
265
+ "Contribution" shall mean any work of authorship, including
266
+ the original version of the Work and any modifications or additions
267
+ to that Work or Derivative Works thereof, that is intentionally
268
+ submitted to Licensor for inclusion in the Work by the copyright owner
269
+ or by an individual or Legal Entity authorized to submit on behalf of
270
+ the copyright owner. For the purposes of this definition, "submitted"
271
+ means any form of electronic, verbal, or written communication sent
272
+ to the Licensor or its representatives, including but not limited to
273
+ communication on electronic mailing lists, source code control systems,
274
+ and issue tracking systems that are managed by, or on behalf of, the
275
+ Licensor for the purpose of discussing and improving the Work, but
276
+ excluding communication that is conspicuously marked or otherwise
277
+ designated in writing by the copyright owner as "Not a Contribution."
278
+
279
+ "Contributor" shall mean Licensor and any individual or Legal Entity
280
+ on behalf of whom a Contribution has been received by Licensor and
281
+ subsequently incorporated within the Work.
282
+
283
+ 2. Grant of Copyright License. Subject to the terms and conditions of
284
+ this License, each Contributor hereby grants to You a perpetual,
285
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
286
+ copyright license to reproduce, prepare Derivative Works of,
287
+ publicly display, publicly perform, sublicense, and distribute the
288
+ Work and such Derivative Works in Source or Object form.
289
+
290
+ 3. Grant of Patent License. Subject to the terms and conditions of
291
+ this License, each Contributor hereby grants to You a perpetual,
292
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
293
+ (except as stated in this section) patent license to make, have made,
294
+ use, offer to sell, sell, import, and otherwise transfer the Work,
295
+ where such license applies only to those patent claims licensable
296
+ by such Contributor that are necessarily infringed by their
297
+ Contribution(s) alone or by combination of their Contribution(s)
298
+ with the Work to which such Contribution(s) was submitted. If You
299
+ institute patent litigation against any entity (including a
300
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
301
+ or a Contribution incorporated within the Work constitutes direct
302
+ or contributory patent infringement, then any patent licenses
303
+ granted to You under this License for that Work shall terminate
304
+ as of the date such litigation is filed.
305
+
306
+ 4. Redistribution. You may reproduce and distribute copies of the
307
+ Work or Derivative Works thereof in any medium, with or without
308
+ modifications, and in Source or Object form, provided that You
309
+ meet the following conditions:
310
+
311
+ (a) You must give any other recipients of the Work or
312
+ Derivative Works a copy of this License; and
313
+
314
+ (b) You must cause any modified files to carry prominent notices
315
+ stating that You changed the files; and
316
+
317
+ (c) You must retain, in the Source form of any Derivative Works
318
+ that You distribute, all copyright, patent, trademark, and
319
+ attribution notices from the Source form of the Work,
320
+ excluding those notices that do not pertain to any part of
321
+ the Derivative Works; and
322
+
323
+ (d) If the Work includes a "NOTICE" text file as part of its
324
+ distribution, then any Derivative Works that You distribute must
325
+ include a readable copy of the attribution notices contained
326
+ within such NOTICE file, excluding those notices that do not
327
+ pertain to any part of the Derivative Works, in at least one
328
+ of the following places: within a NOTICE text file distributed
329
+ as part of the Derivative Works; within the Source form or
330
+ documentation, if provided along with the Derivative Works; or,
331
+ within a display generated by the Derivative Works, if and
332
+ wherever such third-party notices normally appear. The contents
333
+ of the NOTICE file are for informational purposes only and
334
+ do not modify the License. You may add Your own attribution
335
+ notices within Derivative Works that You distribute, alongside
336
+ or as an addendum to the NOTICE text from the Work, provided
337
+ that such additional attribution notices cannot be construed
338
+ as modifying the License.
339
+
340
+ You may add Your own copyright statement to Your modifications and
341
+ may provide additional or different license terms and conditions
342
+ for use, reproduction, or distribution of Your modifications, or
343
+ for any such Derivative Works as a whole, provided Your use,
344
+ reproduction, and distribution of the Work otherwise complies with
345
+ the conditions stated in this License.
346
+
347
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
348
+ any Contribution intentionally submitted for inclusion in the Work
349
+ by You to the Licensor shall be under the terms and conditions of
350
+ this License, without any additional terms or conditions.
351
+ Notwithstanding the above, nothing herein shall supersede or modify
352
+ the terms of any separate license agreement you may have executed
353
+ with Licensor regarding such Contributions.
354
+
355
+ 6. Trademarks. This License does not grant permission to use the trade
356
+ names, trademarks, service marks, or product names of the Licensor,
357
+ except as required for reasonable and customary use in describing the
358
+ origin of the Work and reproducing the content of the NOTICE file.
359
+
360
+ 7. Disclaimer of Warranty. Unless required by applicable law or
361
+ agreed to in writing, Licensor provides the Work (and each
362
+ Contributor provides its Contributions) on an "AS IS" BASIS,
363
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
364
+ implied, including, without limitation, any warranties or conditions
365
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
366
+ PARTICULAR PURPOSE. You are solely responsible for determining the
367
+ appropriateness of using or redistributing the Work and assume any
368
+ risks associated with Your exercise of permissions under this License.
369
+
370
+ 8. Limitation of Liability. In no event and under no legal theory,
371
+ whether in tort (including negligence), contract, or otherwise,
372
+ unless required by applicable law (such as deliberate and grossly
373
+ negligent acts) or agreed to in writing, shall any Contributor be
374
+ liable to You for damages, including any direct, indirect, special,
375
+ incidental, or consequential damages of any character arising as a
376
+ result of this License or out of the use or inability to use the
377
+ Work (including but not limited to damages for loss of goodwill,
378
+ work stoppage, computer failure or malfunction, or any and all
379
+ other commercial damages or losses), even if such Contributor
380
+ has been advised of the possibility of such damages.
381
+
382
+ 9. Accepting Warranty or Additional Liability. While redistributing
383
+ the Work or Derivative Works thereof, You may choose to offer,
384
+ and charge a fee for, acceptance of support, warranty, indemnity,
385
+ or other liability obligations and/or rights consistent with this
386
+ License. However, in accepting such obligations, You may act only
387
+ on Your own behalf and on Your sole responsibility, not on behalf
388
+ of any other Contributor, and only if You agree to indemnify,
389
+ defend, and hold each Contributor harmless for any liability
390
+ incurred by, or claims asserted against, such Contributor by reason
391
+ of your accepting any such warranty or additional liability.
392
+
393
+ END OF TERMS AND CONDITIONS
394
+
395
+ APPENDIX: How to apply the Apache License to your work.
396
+
397
+ To apply the Apache License to your work, attach the following
398
+ boilerplate notice, with the fields enclosed by brackets "[]"
399
+ replaced with your own identifying information. (Don't include
400
+ the brackets!) The text should be enclosed in the appropriate
401
+ comment syntax for the file format. We also recommend that a
402
+ file or class name and description of purpose be included on the
403
+ same "printed page" as the copyright notice for easier
404
+ identification within third-party archives.
405
+
406
+ Copyright [yyyy] [name of copyright owner]
407
+
408
+ Licensed under the Apache License, Version 2.0 (the "License");
409
+ you may not use this file except in compliance with the License.
410
+ You may obtain a copy of the License at
411
+
412
+ http://www.apache.org/licenses/LICENSE-2.0
413
+
414
+ Unless required by applicable law or agreed to in writing, software
415
+ distributed under the License is distributed on an "AS IS" BASIS,
416
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
417
+ See the License for the specific language governing permissions and
418
+ limitations under the License.
tools/DepthPro/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
71
+ available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)
tools/DepthPro/CONTRIBUTING.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contribution Guide
2
+
3
+ Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.
4
+
5
+ While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.
6
+
7
+ ## Before you get started
8
+
9
+ By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
10
+
11
+ We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).
tools/DepthPro/LICENSE ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+
3
+ Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple
4
+ Inc. ("Apple") in consideration of your agreement to the following
5
+ terms, and your use, installation, modification or redistribution of
6
+ this Apple software constitutes acceptance of these terms. If you do
7
+ not agree with these terms, please do not use, install, modify or
8
+ redistribute this Apple software.
9
+
10
+ In consideration of your agreement to abide by the following terms, and
11
+ subject to these terms, Apple grants you a personal, non-exclusive
12
+ license, under Apple's copyrights in this original Apple software (the
13
+ "Apple Software"), to use, reproduce, modify and redistribute the Apple
14
+ Software, with or without modifications, in source and/or binary forms;
15
+ provided that if you redistribute the Apple Software in its entirety and
16
+ without modifications, you must retain this notice and the following
17
+ text and disclaimers in all such redistributions of the Apple Software.
18
+ Neither the name, trademarks, service marks or logos of Apple Inc. may
19
+ be used to endorse or promote products derived from the Apple Software
20
+ without specific prior written permission from Apple. Except as
21
+ expressly stated in this notice, no other rights or licenses, express or
22
+ implied, are granted by Apple herein, including but not limited to any
23
+ patent rights that may be infringed by your derivative works or by other
24
+ works in which the Apple Software may be incorporated.
25
+
26
+ The Apple Software is provided by Apple on an "AS IS" basis. APPLE
27
+ MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
28
+ THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
29
+ FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
30
+ OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
31
+
32
+ IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
33
+ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35
+ INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
36
+ MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
37
+ AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
38
+ STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
39
+ POSSIBILITY OF SUCH DAMAGE.
40
+
41
+
42
+ -------------------------------------------------------------------------------
43
+ SOFTWARE DISTRIBUTED IN THIS REPOSITORY:
44
+
45
+ This software includes a number of subcomponents with separate
46
+ copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
47
+ -------------------------------------------------------------------------------
tools/DepthPro/README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
2
+
3
+ This software project accompanies the research paper:
4
+ **[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**,
5
+ *Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*.
6
+
7
+ ![](data/depth-pro-teaser.jpg)
8
+
9
+ We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image.
10
+
11
+
12
+ The model in this repository is a reference implementation, which has been re-trained. Its performance is close to the model reported in the paper but does not match it exactly.
13
+
14
+ ## Getting Started
15
+
16
+ We recommend setting up a virtual environment. Using e.g. miniconda, the `depth_pro` package can be installed via:
17
+
18
+ ```bash
19
+ conda create -n depth-pro -y python=3.9
20
+ conda activate depth-pro
21
+
22
+ pip install -e .
23
+ ```
24
+
25
+ To download pretrained checkpoints follow the code snippet below:
26
+ ```bash
27
+ source get_pretrained_models.sh # Files will be downloaded to `checkpoints` directory.
28
+ ```
29
+
30
+ ### Running from commandline
31
+
32
+ We provide a helper script to directly run the model on a single image:
33
+ ```bash
34
+ # Run prediction on a single image:
35
+ depth-pro-run -i ./data/example.jpg
36
+ # Run `depth-pro-run -h` for available options.
37
+ ```
38
+
39
+ ### Running from python
40
+
41
+ ```python
42
+ from PIL import Image
43
+ import depth_pro
44
+
45
+ # Load model and preprocessing transform
46
+ model, transform = depth_pro.create_model_and_transforms()
47
+ model.eval()
48
+
49
+ # Load and preprocess an image.
50
+ image, _, f_px = depth_pro.load_rgb(image_path)
51
+ image = transform(image)
52
+
53
+ # Run inference.
54
+ prediction = model.infer(image, f_px=f_px)
55
+ depth = prediction["depth"] # Depth in [m].
56
+ focallength_px = prediction["focallength_px"] # Focal length in pixels.
57
+ ```
58
+
59
+
60
+ ### Evaluation (boundary metrics)
61
+
62
+ Our boundary metrics can be found under `eval/boundary_metrics.py` and used as follows:
63
+
64
+ ```python
65
+ # for a depth-based dataset
66
+ boundary_f1 = SI_boundary_F1(predicted_depth, target_depth)
67
+
68
+ # for a mask-based dataset (image matting / segmentation)
69
+ boundary_recall = SI_boundary_Recall(predicted_depth, target_mask)
70
+ ```
71
+
72
+
73
+ ## Citation
74
+
75
+ If you find our work useful, please cite the following paper:
76
+
77
+ ```bibtex
78
+ @article{Bochkovskii2024:arxiv,
79
+ author = {Aleksei Bochkovskii and Ama\"{e}l Delaunoy and Hugo Germain and Marcel Santos and
80
+ Yichao Zhou and Stephan R. Richter and Vladlen Koltun},
81
+ title = {Depth Pro: Sharp Monocular Metric Depth in Less Than a Second},
82
+ journal = {arXiv},
83
+ year = {2024},
84
+ url = {https://arxiv.org/abs/2410.02073},
85
+ }
86
+ ```
87
+
88
+ ## License
89
+ This sample code is released under the [LICENSE](LICENSE) terms.
90
+
91
+ The model weights are released under the [LICENSE](LICENSE) terms.
92
+
93
+ ## Acknowledgements
94
+
95
+ Our codebase is built using multiple open-source contributions; please see [Acknowledgements](ACKNOWLEDGEMENTS.md) for more details.
96
+
97
+ Please check the paper for a complete list of references and datasets used in this work.
tools/DepthPro/command_pro_dpt.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import src.depth_pro as depth_pro
2
+ import numpy as np
3
+ from PIL import Image
4
+ from src.depth_pro.depth_pro import DepthProConfig
5
+
6
class apple_pro_depth():
    """Convenience wrapper around the Depth Pro model for single-image prediction.

    The model is built once, at construction time, from a fixed
    ``dinov2l16_384`` configuration. Calling the instance on an image returns
    the predicted depth map together with a 3x3 pinhole intrinsics matrix.
    """

    def __init__(self, device='cuda', ckpt='/mnt/proj/SOTAs/ml-depth-pro-main/checkpoints/depth_pro.pt'):
        """Remember the settings and load the model immediately.

        Args:
            device: Device handed to ``create_model_and_transforms`` and used
                for the input tensors.
            ckpt: Checkpoint location, forwarded as ``checkpoint_uri``.
                NOTE(review): the default is an absolute path that looks
                machine-specific; callers will usually need to pass their own.
        """
        self.ckpt = ckpt
        self.device = device
        self._load_model()

    def _load_model(self):
        """Create the model and its input transform, and switch to eval mode."""
        cfg = DepthProConfig(
            patch_encoder_preset="dinov2l16_384",
            image_encoder_preset="dinov2l16_384",
            checkpoint_uri=self.ckpt,
            decoder_features=256,
            use_fov_head=True,
            fov_encoder_preset="dinov2l16_384",
        )
        self.model, self.transform = depth_pro.create_model_and_transforms(
            config=cfg, device=self.device
        )
        self.model.eval()

    def get_intrins(self, f, H, W):
        """Build a 3x3 intrinsics matrix with a centred principal point.

        Args:
            f: Focal length, used for both the x and y axes.
            H: Image height.
            W: Image width.

        Returns:
            ``numpy.ndarray`` of shape (3, 3) with the principal point at
            ``(W / 2 - 0.5, H / 2 - 0.5)``.
        """
        new_cu = (W / 2.0) - 0.5
        new_cv = (H / 2.0) - 0.5
        intrins = np.array([
            [f, 0, new_cu],
            [0, f, new_cv],
            [0, 0, 1],
        ])
        return intrins

    def to(self, device):
        """Move the model to ``device`` and use that device for later inputs.

        Returns:
            ``self``, so the call can be chained like ``nn.Module.to``.
        """
        self.device = device
        self.model.to(device)
        return self

    def __call__(self, image, f_px=None):
        """Predict depth and camera intrinsics for a single image.

        Args:
            image: A ``numpy.ndarray`` or any input the model transform accepts
                directly (presumably a PIL image; confirm against the
                transform). Arrays whose maximum is below 1.1 are treated as
                being in [0, 1] and are scaled by 255 before the uint8 cast.
            f_px: Optional focal length in pixels. It is forwarded to
                ``model.infer`` and, when given, used as-is for the returned
                intrinsics instead of the model's estimate.

        Returns:
            Tuple ``(depth, intrinsics)``: the depth prediction as a numpy
            array and the 3x3 matrix from :meth:`get_intrins`.
        """
        # isinstance (rather than an exact type check) also accepts ndarray subclasses.
        if isinstance(image, np.ndarray):
            if np.amax(image) < 1.1:
                image = image * 255
            image = Image.fromarray(image.astype(np.uint8))
        # Preprocess and move to the model's device.
        image = self.transform(image).to(self.device)
        # Predict.
        prediction = self.model.infer(image, f_px=f_px)
        depth = prediction["depth"]  # Depth in [m].
        focallength_px = prediction["focallength_px"]  # Focal length in pixels.
        # Convert the outputs to numpy and assemble the intrinsics.
        H, W = depth.shape[0:2]
        depth = depth.detach().cpu().numpy()
        focallength_px = focallength_px.detach().cpu().numpy() if f_px is None else f_px
        intrinsic = self.get_intrins(focallength_px, H, W)
        return depth, intrinsic
tools/DepthPro/get_pretrained_models.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # For licensing see accompanying LICENSE file.
4
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
5
+ #
6
+ mkdir -p checkpoints
7
+ # Place final weights here:
8
+ wget https://ml-site.cdn-apple.com/models/depth-pro/depth_pro.pt -P checkpoints
tools/DepthPro/pyproject.toml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "depth_pro"
3
+ version = "0.1"
4
+ description = "Inference/Network/Model code for Apple Depth Pro monocular depth estimation."
5
+ readme = "README.md"
6
+ dependencies = [
7
+ "torch",
8
+ "torchvision",
9
+ "timm",
10
+ "numpy<2",
11
+ "pillow_heif",
12
+ "matplotlib",
13
+ ]
14
+
15
+ [project.scripts]
16
+ depth-pro-run = "depth_pro.cli:run_main"
17
+
18
+ [project.urls]
19
+ Homepage = "https://github.com/apple/ml-depth-pro"
20
+ Repository = "https://github.com/apple/ml-depth-pro"
21
+
22
+ [build-system]
23
+ requires = ["setuptools", "setuptools-scm"]
24
+ build-backend = "setuptools.build_meta"
25
+
26
+ [tool.setuptools.packages.find]
27
+ where = ["src"]
28
+
29
+ [tool.pyright]
30
+ include = ["src"]
31
+ exclude = [
32
+ "**/node_modules",
33
+ "**/__pycache__",
34
+ ]
35
+ pythonVersion = "3.9"
36
+
37
+ [tool.pytest.ini_options]
38
+ minversion = "6.0"
39
+ addopts = "-ra -q"
40
+ testpaths = [
41
+ "tests"
42
+ ]
43
+ filterwarnings = [
44
+ "ignore::DeprecationWarning"
45
+ ]
46
+
47
+ [tool.lint.per-file-ignores]
48
+ "__init__.py" = ["F401", "D100", "D104"]
49
+
50
+ [tool.ruff]
51
+ line-length = 100
52
+ lint.select = ["E", "F", "D", "I"]
53
+ lint.ignore = ["D100", "D105"]
54
+ extend-exclude = [
55
+ "*external*",
56
+ "third_party",
57
+ ]
58
+ src = ["depth_pro", "tests"]
59
+ target-version = "py39"
tools/DepthPro/src/depth_pro/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ """Depth Pro package."""
3
+
4
+ from .depth_pro import create_model_and_transforms # noqa
5
+ from .utils import load_rgb # noqa
tools/DepthPro/src/depth_pro/cli/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ """Depth Pro CLI and tools."""
3
+
4
+ from .run import main as run_main # noqa
tools/DepthPro/src/depth_pro/cli/run.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Sample script to run DepthPro.
3
+
4
+ Copyright (C) 2024 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+
8
+ import argparse
9
+ import logging
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import PIL.Image
14
+ import torch
15
+ from matplotlib import pyplot as plt
16
+ from tqdm import tqdm
17
+
18
+ from depth_pro import create_model_and_transforms, load_rgb
19
+
20
+ LOGGER = logging.getLogger(__name__)
21
+
22
+
23
def get_torch_device() -> torch.device:
    """Pick the device to run on: CUDA first, then MPS, otherwise CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda:0")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
31
+
32
+
33
def run(args):
    """Run Depth Pro on one image, or on every file found under a directory.

    Args:
        args: Parsed command-line namespace providing ``image_path`` (a
            ``Path`` to a file or a directory), ``output_path`` (a ``Path`` or
            ``None``), ``skip_display`` and ``verbose``.
    """
    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # Load model.
    model, transform = create_model_and_transforms(
        device=get_torch_device(),
        precision=torch.half,
    )
    model.eval()

    if args.image_path.is_dir():
        # Recurse through the directory but skip the sub-directories themselves:
        # they can never be loaded as images and would only log a spurious error each.
        image_paths = (p for p in args.image_path.glob("**/*") if p.is_file())
        relative_path = args.image_path
    else:
        image_paths = [args.image_path]
        relative_path = args.image_path.parent

    if not args.skip_display:
        plt.ion()
        fig = plt.figure()
        ax_rgb = fig.add_subplot(121)
        ax_disp = fig.add_subplot(122)

    for image_path in tqdm(image_paths):
        # Load the image and, if present, the focal length from its EXIF info.
        try:
            LOGGER.info(f"Loading image {image_path} ...")
            image, _, f_px = load_rgb(image_path)
        except Exception as e:
            # Best effort: report unreadable files and carry on with the rest.
            LOGGER.error(str(e))
            continue
        # Run prediction. If `f_px` is provided, it is used to estimate the final metric
        # depth; otherwise the model estimates `f_px` itself.
        prediction = model.infer(transform(image), f_px=f_px)

        # Extract the depth and focal length.
        depth = prediction["depth"].detach().cpu().numpy().squeeze()
        if f_px is not None:
            LOGGER.debug(f"Focal length (from exif): {f_px:0.2f}")
        elif prediction["focallength_px"] is not None:
            focallength_px = prediction["focallength_px"].detach().cpu().item()
            LOGGER.info(f"Estimated focal length: {focallength_px}")

        inverse_depth = 1 / depth
        # Visualize inverse depth instead of depth, with the normalization bounds
        # limited to the [0.1m;250m] range for better visualization.
        max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)
        min_invdepth_vizu = max(1 / 250, inverse_depth.min())
        invdepth_range = max_invdepth_vizu - min_invdepth_vizu
        if invdepth_range == 0:
            # Constant inverse depth: avoid 0/0 (NaN) and render a uniform map instead.
            inverse_depth_normalized = np.zeros_like(inverse_depth)
        else:
            inverse_depth_normalized = (
                inverse_depth - min_invdepth_vizu
            ) / invdepth_range

        # Save depth as an npz file.
        if args.output_path is not None:
            output_file = (
                args.output_path
                / image_path.relative_to(relative_path).parent
                / image_path.stem
            )
            LOGGER.info(f"Saving depth map to: {str(output_file)}")
            output_file.parent.mkdir(parents=True, exist_ok=True)
            np.savez_compressed(output_file, depth=depth)

            # Save as color-mapped "turbo" jpg image.
            cmap = plt.get_cmap("turbo")
            color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype(
                np.uint8
            )
            color_map_output_file = str(output_file) + ".jpg"
            LOGGER.info(f"Saving color-mapped depth to: {color_map_output_file}")
            PIL.Image.fromarray(color_depth).save(
                color_map_output_file, format="JPEG", quality=90
            )

        # Display the image and estimated depth map.
        if not args.skip_display:
            ax_rgb.imshow(image)
            ax_disp.imshow(inverse_depth_normalized, cmap="turbo")
            fig.canvas.draw()
            fig.canvas.flush_events()

    LOGGER.info("Done predicting depth!")
    if not args.skip_display:
        plt.show(block=True)
118
+
119
+
120
def main():
    """Parse the command-line options and pass them on to ``run``."""
    cli = argparse.ArgumentParser(
        description="Inference scripts of DepthPro with PyTorch models."
    )
    # Input: a single image, or a directory of images.
    cli.add_argument(
        "-i", "--image-path", type=Path, default="./data/example.jpg",
        help="Path to input image.",
    )
    # Output directory; left unset means nothing is written to disk.
    cli.add_argument(
        "-o", "--output-path", type=Path,
        help="Path to store output files.",
    )
    cli.add_argument(
        "--skip-display", action="store_true",
        help="Skip matplotlib display.",
    )
    cli.add_argument(
        "-v", "--verbose", action="store_true",
        help="Show verbose output."
    )

    options = cli.parse_args()
    run(options)
151
+
152
+
153
+ if __name__ == "__main__":
154
+ main()
tools/DepthPro/src/depth_pro/depth_pro.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ # Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
3
+
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Mapping, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from torch import nn
12
+ from torchvision.transforms import (
13
+ Compose,
14
+ ConvertImageDtype,
15
+ Lambda,
16
+ Normalize,
17
+ ToTensor,
18
+ )
19
+
20
+ from .network.decoder import MultiresConvDecoder
21
+ from .network.encoder import DepthProEncoder
22
+ from .network.fov import FOVNetwork
23
+ from .network.vit_factory import VIT_CONFIG_DICT, ViTPreset, create_vit
24
+
25
+
26
@dataclass
class DepthProConfig:
    """Settings describing how a DepthPro model is assembled."""

    # Required: backbone presets for the patch and image encoders, and the
    # number of features used by the decoder.
    patch_encoder_preset: ViTPreset
    image_encoder_preset: ViTPreset
    decoder_features: int

    # Optional: where to load weights from, which backbone preset the
    # field-of-view head uses, and whether that head is built at all.
    checkpoint_uri: Optional[str] = None
    fov_encoder_preset: Optional[ViTPreset] = None
    use_fov_head: bool = True
37
+
38
+
39
# Default configuration: the same "dinov2l16_384" preset for all three
# backbones, with weights read from a local checkpoint file.
DEFAULT_MONODEPTH_CONFIG_DICT = DepthProConfig(
    patch_encoder_preset="dinov2l16_384",
    image_encoder_preset="dinov2l16_384",
    fov_encoder_preset="dinov2l16_384",
    use_fov_head=True,
    decoder_features=256,
    checkpoint_uri="./checkpoints/depth_pro.pt",
)
47
+
48
+
49
def create_backbone_model(
    preset: ViTPreset
) -> Tuple[nn.Module, ViTPreset]:
    """Build the backbone registered under `preset`.

    Args:
    ----
        preset: Name of a backbone preset; must be a key of `VIT_CONFIG_DICT`.

    Returns:
    -------
        The backbone module (created with `use_pretrained=False`) and the
        config entry stored for the preset.

    Raises:
    ------
        KeyError: If `preset` is not a known preset.

    """
    # Reject unknown presets up front so the happy path stays flat.
    if preset not in VIT_CONFIG_DICT:
        raise KeyError(f"Preset {preset} not found.")

    preset_config = VIT_CONFIG_DICT[preset]
    backbone = create_vit(preset=preset, use_pretrained=False)
    return backbone, preset_config
70
+
71
+
72
def create_model_and_transforms(
    config: DepthProConfig = DEFAULT_MONODEPTH_CONFIG_DICT,
    device: torch.device = torch.device("cpu"),
    precision: torch.dtype = torch.float32,
) -> Tuple[DepthPro, Compose]:
    """Assemble a DepthPro model, its input transform, and load its weights.

    Args:
    ----
        config: Architecture description; weights are loaded from
            `config.checkpoint_uri` unless it is None.
        device: Torch device the model (and transformed inputs) are moved to.
        precision: Dtype of the transformed inputs; the model itself is only
            converted when this is `torch.half`.

    Returns:
    -------
        The DepthPro model and the matching preprocessing transform.

    Raises:
    ------
        KeyError: If a preset is unknown, or if the checkpoint has unexpected
            keys or lacks keys other than `fc_norm` ones.

    """
    # Backbones: the patch encoder's config also drives the encoder layout.
    patch_encoder, patch_encoder_config = create_backbone_model(
        preset=config.patch_encoder_preset
    )
    image_encoder, _ = create_backbone_model(preset=config.image_encoder_preset)

    # A separate FOV backbone is only built when the head is enabled and a
    # preset for it is configured.
    wants_fov_encoder = config.use_fov_head and config.fov_encoder_preset is not None
    if wants_fov_encoder:
        fov_encoder, _ = create_backbone_model(preset=config.fov_encoder_preset)
    else:
        fov_encoder = None

    encoder = DepthProEncoder(
        dims_encoder=patch_encoder_config.encoder_feature_dims,
        patch_encoder=patch_encoder,
        image_encoder=image_encoder,
        hook_block_ids=patch_encoder_config.encoder_feature_layer_ids,
        decoder_features=config.decoder_features,
    )
    decoder_input_dims = [config.decoder_features, *encoder.dims_encoder]
    decoder = MultiresConvDecoder(
        dims_encoder=decoder_input_dims,
        dim_decoder=config.decoder_features,
    )

    model = DepthPro(
        encoder=encoder,
        decoder=decoder,
        last_dims=(32, 1),
        use_fov_head=config.use_fov_head,
        fov_encoder=fov_encoder,
    )
    model = model.to(device)
    if precision == torch.half:
        model.half()

    # Preprocessing: to tensor, move to the device, normalize each channel
    # with mean 0.5 / std 0.5, then cast to the requested precision.
    preprocessing_steps = [
        ToTensor(),
        Lambda(lambda x: x.to(device)),
        Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ConvertImageDtype(precision),
    ]
    transform = Compose(preprocessing_steps)

    if config.checkpoint_uri is None:
        return model, transform

    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    state_dict = torch.load(config.checkpoint_uri, map_location="cpu")
    missing_keys, unexpected_keys = model.load_state_dict(
        state_dict=state_dict, strict=True
    )

    if unexpected_keys:
        raise KeyError(
            f"Found unexpected keys when loading monodepth: {unexpected_keys}"
        )

    # fc_norm belongs to the classification head, which is not used here;
    # only the encoding is, so missing fc_norm keys are tolerated.
    missing_keys = [name for name in missing_keys if "fc_norm" not in name]
    if missing_keys:
        raise KeyError(f"Keys are missing when loading monodepth: {missing_keys}")

    return model, transform
152
+
153
+
154
class DepthPro(nn.Module):
    """DepthPro network: encoder, multi-resolution decoder and depth head.

    An optional field-of-view (FOV) head estimates the focal length when the
    caller does not supply one.
    """

    def __init__(
        self,
        encoder: DepthProEncoder,
        decoder: MultiresConvDecoder,
        last_dims: tuple[int, int],
        use_fov_head: bool = True,
        fov_encoder: Optional[nn.Module] = None,
    ):
        """Initialize DepthPro.

        Args:
        ----
            encoder: The DepthProEncoder backbone.
            decoder: The MultiresConvDecoder decoder.
            last_dims: The dimension for the last convolution layers.
            use_fov_head: Whether to use the field-of-view head.
            fov_encoder: A separate encoder for the field of view.

        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        dim_decoder = decoder.dim_decoder
        self.head = nn.Sequential(
            nn.Conv2d(
                dim_decoder, dim_decoder // 2, kernel_size=3, stride=1, padding=1
            ),
            # Stride-2 transposed convolution: doubles the spatial resolution.
            nn.ConvTranspose2d(
                in_channels=dim_decoder // 2,
                out_channels=dim_decoder // 2,
                kernel_size=2,
                stride=2,
                padding=0,
                bias=True,
            ),
            nn.Conv2d(
                dim_decoder // 2,
                last_dims[0],
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(True),
            nn.Conv2d(last_dims[0], last_dims[1], kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
        )

        # Set the final convolution layer's bias to be 0.
        self.head[4].bias.data.fill_(0)

        # Set the FOV estimation head. The attribute only exists when the
        # head is enabled; forward() and infer() test for it with hasattr.
        if use_fov_head:
            self.fov = FOVNetwork(num_features=dim_decoder, fov_encoder=fov_encoder)

    @property
    def img_size(self) -> int:
        """Return the internal image size of the network."""
        return self.encoder.img_size

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Decode by projection and fusion of multi-resolution encodings.

        Args:
        ----
            x (torch.Tensor): Input image batch whose height and width both
                equal `img_size`.

        Returns:
        -------
            The canonical inverse depth map [m] and the optional estimated field of view [deg]
            (None when the model has no FOV head).

        """
        _, _, H, W = x.shape
        assert H == self.img_size and W == self.img_size

        encodings = self.encoder(x)
        features, features_0 = self.decoder(encodings)
        canonical_inverse_depth = self.head(features)

        fov_deg = None
        if hasattr(self, "fov"):
            # The FOV head receives detached features.
            fov_deg = self.fov.forward(x, features_0.detach())

        return canonical_inverse_depth, fov_deg

    @torch.no_grad()
    def infer(
        self,
        x: torch.Tensor,
        f_px: Optional[Union[float, torch.Tensor]] = None,
        interpolation_mode="bilinear",
    ) -> Mapping[str, torch.Tensor]:
        """Infer depth and focal length for a given image.

        If the image is not at network resolution, it is resized to
        `img_size` x `img_size` and the estimated depth is resized back to the
        original image resolution.
        Note: if the focal length is given, the estimated value is ignored and
        the provided focal length is used to generate the metric depth values.

        Args:
        ----
            x (torch.Tensor): Input image, with or without a leading batch dimension.
            f_px (float or torch.Tensor): Optional focal length in pixels
                corresponding to `x`.
            interpolation_mode (str): Interpolation function for downsampling/upsampling.

        Returns:
        -------
            Tensor dictionary (torch.Tensor): depth [m], focallength [pixels].

        Raises:
        ------
            ValueError: If `f_px` is None and the model has no FOV head to
                estimate the focal length.

        """
        # Fail early, before the expensive forward pass, when the focal
        # length can neither be taken from the caller nor estimated.
        if f_px is None and not hasattr(self, "fov"):
            raise ValueError(
                "f_px must be provided when the model has no field-of-view head."
            )

        if len(x.shape) == 3:
            x = x.unsqueeze(0)
        _, _, H, W = x.shape
        resize = H != self.img_size or W != self.img_size

        if resize:
            x = nn.functional.interpolate(
                x,
                size=(self.img_size, self.img_size),
                mode=interpolation_mode,
                align_corners=False,
            )

        canonical_inverse_depth, fov_deg = self.forward(x)
        if f_px is None:
            f_px = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_deg.to(torch.float)))
        elif not isinstance(f_px, torch.Tensor):
            # A plain Python number has no .squeeze(); wrap it so the
            # returned focal length is always a tensor.
            f_px = torch.as_tensor(f_px, dtype=torch.float, device=x.device)

        # Scale the canonical prediction by the original width over the focal length.
        inverse_depth = canonical_inverse_depth * (W / f_px)
        f_px = f_px.squeeze()

        if resize:
            inverse_depth = nn.functional.interpolate(
                inverse_depth, size=(H, W), mode=interpolation_mode, align_corners=False
            )

        # Clamp before inverting so the depth stays finite.
        depth = 1.0 / torch.clamp(inverse_depth, min=1e-4, max=1e4)

        return {
            "depth": depth.squeeze(),
            "focallength_px": f_px,
        }
tools/DepthPro/src/depth_pro/eval/boundary_metrics.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ import numpy as np
4
+
5
+
6
def connected_component(r: np.ndarray, c: np.ndarray) -> List[List[int]]:
    """Group positions into runs of horizontally adjacent entries.

    Walks the paired `r`/`c` arrays in order; an entry joins the current run
    when it lies in the same row as the previous entry and exactly one column
    to its right, otherwise it starts a new run. This is a generator.

    Args:
    ----
        r (np.ndarray): Row indices.
        c (np.ndarray): Column indices.

    Yields:
    ------
        List[int]: Positions (into `r`/`c`) forming one run. Nothing is
            yielded for empty input.

    """
    # Without this guard an empty input would yield [0], a position that
    # does not exist in `r`/`c`.
    if r.size == 0:
        return

    run = [0]
    for i in range(1, r.size):
        last = run[-1]
        if r[i] == r[last] and c[i] == c[last] + 1:
            run.append(i)
        else:
            yield run
            run = [i]
    yield run
27
+
28
+
29
def nms_horizontal(ratio: np.ndarray, threshold: float) -> np.ndarray:
    """Keep one maximum per horizontal run of above-threshold entries.

    Args:
    ----
        ratio (np.ndarray): Input ratio matrix.
        threshold (float): Entries must exceed this value to be considered.

    Returns:
    -------
        np.ndarray: Boolean mask, True at the largest entry of each run.

    """
    rows, cols = np.nonzero(ratio > threshold)
    keep = np.zeros_like(ratio, dtype=bool)
    if rows.size == 0:
        return keep

    for run in connected_component(rows, cols):
        run_values = [ratio[rows[k], cols[k]] for k in run]
        # argmax returns the first maximum, so ties go to the leftmost entry.
        winner = run[np.argmax(run_values)]
        keep[rows[winner], cols[winner]] = True
    return keep
51
+
52
+
53
def nms_vertical(ratio: np.ndarray, threshold: float) -> np.ndarray:
    """Keep one maximum per vertical run of above-threshold entries.

    Implemented by transposing, applying the horizontal variant, and
    transposing the resulting mask back.

    Args:
    ----
        ratio (np.ndarray): Input ratio matrix.
        threshold (float): Entries must exceed this value to be considered.

    Returns:
    -------
        np.ndarray: Boolean mask with the same orientation as `ratio`.

    """
    flipped = np.transpose(ratio)
    flipped_mask = nms_horizontal(flipped, threshold)
    return np.transpose(flipped_mask)
67
+
68
+
69
def fgbg_depth(
    d: np.ndarray, t: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compare each pixel with its horizontal and vertical neighbours.

    For every adjacent pair along the last two axes, the ratio of one value
    to the other is tested against `t`.

    Args:
    ----
        d (np.ndarray): Depth matrix.
        t (float): Ratio threshold.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Boolean
            matrices in the order left, top, right, bottom.

    """
    # Views of each neighbouring pair, sharing memory with `d`.
    west, east = d[..., :, :-1], d[..., :, 1:]
    north, south = d[..., :-1, :], d[..., 1:, :]

    left = (west / east) > t
    top = (north / south) > t
    right = (east / west) > t
    bottom = (south / north) > t
    return left, top, right, bottom
95
+
96
+
97
def fgbg_depth_thinned(
    d: np.ndarray, t: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compare neighbouring pixels and thin the result with NMS.

    Horizontal neighbour ratios go through `nms_horizontal`, vertical ones
    through `nms_vertical`.

    Args:
    ----
        d (np.ndarray): Depth matrix.
        t (float): Threshold for NMS.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Boolean
            matrices in the order left, top, right, bottom.

    """
    # Views of each neighbouring pair, sharing memory with `d`.
    west, east = d[..., :, :-1], d[..., :, 1:]
    north, south = d[..., :-1, :], d[..., 1:, :]

    right = nms_horizontal(east / west, t)
    left = nms_horizontal(west / east, t)
    bottom = nms_vertical(south / north, t)
    top = nms_vertical(north / south, t)
    return left, top, right, bottom
123
+
124
+
125
def fgbg_binary_mask(
    d: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Mark neighbouring pixel pairs where exactly one named side is set.

    Each output is True where one pixel of an adjacent pair is True and the
    other is False, per direction.

    Args:
    ----
        d (np.ndarray): Boolean mask.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Boolean
            matrices in the order left, top, right, bottom.

    """
    assert d.dtype == bool

    # Views of each neighbouring pair, sharing memory with `d`.
    west, east = d[..., :, :-1], d[..., :, 1:]
    north, south = d[..., :-1, :], d[..., 1:, :]

    left = west & ~east
    top = north & ~south
    right = east & ~west
    bottom = south & ~north
    return left, top, right, bottom
151
+
152
+
153
def edge_recall_matting(pr: np.ndarray, gt: np.ndarray, t: float) -> float:
    """Calculate edge recall of a prediction against a binary mask.

    Args:
    ----
        pr (np.ndarray): Predicted depth matrix.
        gt (np.ndarray): Ground truth binary mask (boolean dtype).
        t (float): Threshold for NMS.

    Returns:
    -------
        float: Recall averaged over the four directions.

    """
    assert gt.dtype == bool
    predicted_edges = fgbg_depth_thinned(pr, t)
    reference_edges = fgbg_binary_mask(gt)

    # Per direction: matched edges over reference edges; the denominator is
    # floored at 1 so an edge-free reference contributes 0.
    per_direction = (
        np.count_nonzero(pred & ref) / max(np.count_nonzero(ref), 1)
        for pred, ref in zip(predicted_edges, reference_edges)
    )
    return 0.25 * sum(per_direction)
176
+
177
+
178
def boundary_f1(
    pr: np.ndarray,
    gt: np.ndarray,
    t: float,
    return_p: bool = False,
    return_r: bool = False,
) -> float:
    """Calculate Boundary F1 score (or its precision / recall component).

    Args:
    ----
        pr (np.ndarray): Predicted depth matrix.
        gt (np.ndarray): Ground truth depth matrix.
        t (float): Threshold for comparison.
        return_p (bool, optional): If True, return precision. Defaults to False.
        return_r (bool, optional): If True, return recall. Defaults to False.

    Returns:
    -------
        float: 0.0 when precision and recall are both zero; otherwise
            precision if `return_p`, else recall if `return_r`, else F1.

    """
    predicted_edges = fgbg_depth(pr, t)
    reference_edges = fgbg_depth(gt, t)

    recall_terms = []
    precision_terms = []
    for pred, ref in zip(predicted_edges, reference_edges):
        matched = np.count_nonzero(pred & ref)
        # Denominators are floored at 1 to avoid dividing by zero.
        recall_terms.append(matched / max(np.count_nonzero(ref), 1))
        precision_terms.append(matched / max(np.count_nonzero(pred), 1))

    r = 0.25 * sum(recall_terms)
    p = 0.25 * sum(precision_terms)

    if r + p == 0:
        return 0.0
    if return_p:
        return p
    if return_r:
        return r
    return 2 * (r * p) / (r + p)
222
+
223
+
224
def get_thresholds_and_weights(
    t_min: float, t_max: float, N: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Generate evenly spaced thresholds and weights proportional to them.

    Args:
    ----
        t_min (float): Minimum threshold.
        t_max (float): Maximum threshold.
        N (int): Number of thresholds.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray]: The thresholds and, for each one, its
            share of the sum of all thresholds.

    """
    thresholds = np.linspace(t_min, t_max, N)
    total = thresholds.sum()
    return thresholds, thresholds / total
243
+
244
+
245
def invert_depth(depth: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Return the reciprocal of a depth map, flooring small values first.

    Args:
    ----
        depth (np.ndarray): Depth map to be inverted.
        eps (float): Values below this are raised to it before inverting,
            which avoids division by zero (default is 1e-6).

    Returns:
    -------
        np.ndarray: Inverted depth map.

    """
    floored = depth.clip(min=eps)
    return 1.0 / floored
260
+
261
+
262
def SI_boundary_F1(
    predicted_depth: np.ndarray,
    target_depth: np.ndarray,
    t_min: float = 1.05,
    t_max: float = 1.25,
    N: int = 10,
) -> float:
    """Calculate Scale-Invariant Boundary F1 Score for depth-based ground-truth.

    The boundary F1 score is evaluated on the inverted depth maps at `N`
    thresholds between `t_min` and `t_max` and combined as a weighted sum.

    Args:
    ----
        predicted_depth (np.ndarray): Predicted depth matrix (2-D).
        target_depth (np.ndarray): Ground truth depth matrix (2-D).
        t_min (float, optional): Minimum threshold. Defaults to 1.05.
        t_max (float, optional): Maximum threshold. Defaults to 1.25.
        N (int, optional): Number of thresholds. Defaults to 10.

    Returns:
    -------
        float: Scale-Invariant Boundary F1 Score.

    """
    assert predicted_depth.ndim == target_depth.ndim == 2
    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)

    # The inverted maps do not depend on the threshold, so compute them once
    # instead of once per threshold.
    inverse_predicted = invert_depth(predicted_depth)
    inverse_target = invert_depth(target_depth)

    f1_scores = np.array(
        [boundary_f1(inverse_predicted, inverse_target, t) for t in thresholds]
    )
    return np.sum(f1_scores * weights)
293
+
294
+
295
def SI_boundary_Recall(
    predicted_depth: np.ndarray,
    target_mask: np.ndarray,
    t_min: float = 1.05,
    t_max: float = 1.25,
    N: int = 10,
    alpha_threshold: float = 0.1,
) -> float:
    """Calculate Scale-Invariant Boundary Recall Score for mask-based ground-truth.

    The mask is binarized with `alpha_threshold`; edge recall of the inverted
    predicted depth is then evaluated at `N` thresholds between `t_min` and
    `t_max` and combined as a weighted sum.

    Args:
    ----
        predicted_depth (np.ndarray): Predicted depth matrix (2-D).
        target_mask (np.ndarray): Ground truth mask (2-D).
        t_min (float, optional): Minimum threshold. Defaults to 1.05.
        t_max (float, optional): Maximum threshold. Defaults to 1.25.
        N (int, optional): Number of thresholds. Defaults to 10.
        alpha_threshold (float, optional): Threshold for alpha masking. Defaults to 0.1.

    Returns:
    -------
        float: Scale-Invariant Boundary Recall Score.

    """
    assert predicted_depth.ndim == target_mask.ndim == 2
    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
    thresholded_target = target_mask > alpha_threshold

    # The inverted prediction does not depend on the threshold, so compute
    # it once instead of once per threshold.
    inverse_predicted = invert_depth(predicted_depth)

    recall_scores = np.array(
        [
            edge_recall_matting(inverse_predicted, thresholded_target, t=float(t))
            for t in thresholds
        ]
    )
    weighted_recall = np.sum(recall_scores * weights)
    return weighted_recall
tools/DepthPro/src/depth_pro/eval/dis5k_sample_list.txt ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DIS5K/DIS-TE1/im/12#Graphics#4#TrafficSign#8245751856_821be14f86_o.jpg
2
+ DIS5K/DIS-TE1/im/13#Insect#4#Butterfly#16023994688_7ff8cdccb1_o.jpg
3
+ DIS5K/DIS-TE1/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205538.jpg
4
+ DIS5K/DIS-TE1/im/14#Kitchenware#8#SweetStand#4848284981_fc90f54b50_o.jpg
5
+ DIS5K/DIS-TE1/im/17#Non-motor Vehicle#4#Cart#15012855035_d10b57014f_o.jpg
6
+ DIS5K/DIS-TE1/im/2#Aircraft#5#Kite#13104545564_5afceec9bd_o.jpg
7
+ DIS5K/DIS-TE1/im/20#Sports#10#Skateboarding#8472763540_bb2390e928_o.jpg
8
+ DIS5K/DIS-TE1/im/21#Tool#14#Sword#32473146960_dcc6b77848_o.jpg
9
+ DIS5K/DIS-TE1/im/21#Tool#15#Tapeline#9680492386_2d2020f282_o.jpg
10
+ DIS5K/DIS-TE1/im/21#Tool#4#Flag#507752845_ef852100f0_o.jpg
11
+ DIS5K/DIS-TE1/im/21#Tool#6#Key#11966089533_3becd78b44_o.jpg
12
+ DIS5K/DIS-TE1/im/21#Tool#8#Scale#31946428472_d28def471b_o.jpg
13
+ DIS5K/DIS-TE1/im/22#Weapon#4#Rifle#8472656430_3eb908b211_o.jpg
14
+ DIS5K/DIS-TE1/im/8#Electronics#3#Earphone#1177468301_641df8c267_o.jpg
15
+ DIS5K/DIS-TE1/im/8#Electronics#9#MusicPlayer#2235782872_7d47847bb4_o.jpg
16
+ DIS5K/DIS-TE2/im/11#Furniture#13#Ladder#3878434417_2ed740586e_o.jpg
17
+ DIS5K/DIS-TE2/im/13#Insect#1#Ant#27047700955_3b3a1271f8_o.jpg
18
+ DIS5K/DIS-TE2/im/13#Insect#11#Spider#5567179191_38d1f65589_o.jpg
19
+ DIS5K/DIS-TE2/im/13#Insect#8#Locust#5237933769_e6687c05e4_o.jpg
20
+ DIS5K/DIS-TE2/im/14#Kitchenware#2#DishRack#70838854_40cf689da7_o.jpg
21
+ DIS5K/DIS-TE2/im/14#Kitchenware#8#SweetStand#8467929412_fef7f4275d_o.jpg
22
+ DIS5K/DIS-TE2/im/16#Music Instrument#2#Harp#28058219806_28e05ff24a_o.jpg
23
+ DIS5K/DIS-TE2/im/17#Non-motor Vehicle#1#BabyCarriage#29794777180_2e1695a0cf_o.jpg
24
+ DIS5K/DIS-TE2/im/19#Ship#3#Sailboat#22442908623_5977e3becf_o.jpg
25
+ DIS5K/DIS-TE2/im/2#Aircraft#5#Kite#44654358051_1400e71cc4_o.jpg
26
+ DIS5K/DIS-TE2/im/21#Tool#11#Stand#IMG_20210520_205442.jpg
27
+ DIS5K/DIS-TE2/im/21#Tool#17#Tripod#9318977876_34615ec9a0_o.jpg
28
+ DIS5K/DIS-TE2/im/5#Artifact#3#Handcraft#50860882577_8482143b1b_o.jpg
29
+ DIS5K/DIS-TE2/im/8#Electronics#10#Robot#3093360210_fee54dc5c5_o.jpg
30
+ DIS5K/DIS-TE2/im/8#Electronics#6#Microphone#47411477652_6da66cbc10_o.jpg
31
+ DIS5K/DIS-TE3/im/14#Kitchenware#4#Kitchenware#2451122898_ef883175dd_o.jpg
32
+ DIS5K/DIS-TE3/im/15#Machine#4#SewingMachine#9311164128_97ba1d3947_o.jpg
33
+ DIS5K/DIS-TE3/im/16#Music Instrument#2#Harp#7670920550_59e992fd7b_o.jpg
34
+ DIS5K/DIS-TE3/im/17#Non-motor Vehicle#1#BabyCarriage#8389984877_1fddf8715c_o.jpg
35
+ DIS5K/DIS-TE3/im/17#Non-motor Vehicle#3#Carriage#5947122724_98e0fc3d1f_o.jpg
36
+ DIS5K/DIS-TE3/im/2#Aircraft#2#Balloon#2487168092_641505883f_o.jpg
37
+ DIS5K/DIS-TE3/im/2#Aircraft#4#Helicopter#8401177591_06c71c8df2_o.jpg
38
+ DIS5K/DIS-TE3/im/20#Sports#1#Archery#12520003103_faa43ea3e0_o.jpg
39
+ DIS5K/DIS-TE3/im/21#Tool#11#Stand#IMG_20210709_221507.jpg
40
+ DIS5K/DIS-TE3/im/21#Tool#2#Clip#5656649687_63d0c6696d_o.jpg
41
+ DIS5K/DIS-TE3/im/21#Tool#6#Key#12878459244_6387a140ea_o.jpg
42
+ DIS5K/DIS-TE3/im/3#Aquatic#1#Lobster#109214461_f52b4b6093_o.jpg
43
+ DIS5K/DIS-TE3/im/4#Architecture#19#Windmill#20195851863_2627117e0e_o.jpg
44
+ DIS5K/DIS-TE3/im/5#Artifact#2#Cage#5821476369_ea23927487_o.jpg
45
+ DIS5K/DIS-TE3/im/8#Electronics#7#MobileHolder#49732997896_7f53c290b5_o.jpg
46
+ DIS5K/DIS-TE4/im/13#Insect#6#Centipede#15302179708_a267850881_o.jpg
47
+ DIS5K/DIS-TE4/im/17#Non-motor Vehicle#11#Tricycle#5771069105_a3aef6f665_o.jpg
48
+ DIS5K/DIS-TE4/im/17#Non-motor Vehicle#2#Bicycle#4245936196_fdf812dcb7_o.jpg
49
+ DIS5K/DIS-TE4/im/17#Non-motor Vehicle#9#ShoppingCart#4674052920_a5b7a2b236_o.jpg
50
+ DIS5K/DIS-TE4/im/18#Plant#1#Bonsai#3539420884_ca8973e2c0_o.jpg
51
+ DIS5K/DIS-TE4/im/2#Aircraft#6#Parachute#33590416634_9d6f2325e7_o.jpg
52
+ DIS5K/DIS-TE4/im/20#Sports#1#Archery#46924476515_0be1caa684_o.jpg
53
+ DIS5K/DIS-TE4/im/20#Sports#8#Racket#19337607166_dd1985fb59_o.jpg
54
+ DIS5K/DIS-TE4/im/21#Tool#6#Key#3193329588_839b0c74ce_o.jpg
55
+ DIS5K/DIS-TE4/im/5#Artifact#2#Cage#5821886526_0573ba2d0d_o.jpg
56
+ DIS5K/DIS-TE4/im/5#Artifact#3#Handcraft#50105138282_3c1d02c968_o.jpg
57
+ DIS5K/DIS-TE4/im/8#Electronics#1#Antenna#4305034305_874f21a701_o.jpg
58
+ DIS5K/DIS-TR/im/1#Accessories#1#Bag#15554964549_3105e51b6f_o.jpg
59
+ DIS5K/DIS-TR/im/1#Accessories#1#Bag#41104261980_098a6c4a56_o.jpg
60
+ DIS5K/DIS-TR/im/1#Accessories#2#Clothes#2284764037_871b2e8ca4_o.jpg
61
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#1824643784_70d0134156_o.jpg
62
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#3590020230_37b09a29b3_o.jpg
63
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#4809652879_4da8a69f3b_o.jpg
64
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#792204934_f9b28f99b4_o.jpg
65
+ DIS5K/DIS-TR/im/1#Accessories#5#Jewelry#13909132974_c4750c5fb7_o.jpg
66
+ DIS5K/DIS-TR/im/1#Accessories#7#Shoe#2483391615_9199ece8d6_o.jpg
67
+ DIS5K/DIS-TR/im/1#Accessories#8#Watch#4343266960_f6633b029b_o.jpg
68
+ DIS5K/DIS-TR/im/10#Frame#2#BicycleFrame#17897573_42964dd104_o.jpg
69
+ DIS5K/DIS-TR/im/10#Frame#5#Rack#15898634812_64807069ff_o.jpg
70
+ DIS5K/DIS-TR/im/10#Frame#5#Rack#23928546819_c184cb0b60_o.jpg
71
+ DIS5K/DIS-TR/im/11#Furniture#19#Shower#6189119596_77bcfe80ee_o.jpg
72
+ DIS5K/DIS-TR/im/11#Furniture#2#Bench#3263647075_9306e280b5_o.jpg
73
+ DIS5K/DIS-TR/im/11#Furniture#5#CoatHanger#12774091054_cd5ff520ef_o.jpg
74
+ DIS5K/DIS-TR/im/11#Furniture#6#DentalChair#13878156865_d0439dcb32_o.jpg
75
+ DIS5K/DIS-TR/im/11#Furniture#9#Easel#5861024714_2070cd480c_o.jpg
76
+ DIS5K/DIS-TR/im/12#Graphics#4#TrafficSign#40621867334_f3c32ec189_o.jpg
77
+ DIS5K/DIS-TR/im/13#Insect#1#Ant#3295038190_db5dd0d4f4_o.jpg
78
+ DIS5K/DIS-TR/im/13#Insect#10#Mosquito#24341339_a88a1dad4c_o.jpg
79
+ DIS5K/DIS-TR/im/13#Insect#11#Spider#27171518270_63b78069ff_o.jpg
80
+ DIS5K/DIS-TR/im/13#Insect#11#Spider#49925050281_fa727c154e_o.jpg
81
+ DIS5K/DIS-TR/im/13#Insect#2#Beatle#279616486_2f1e64f591_o.jpg
82
+ DIS5K/DIS-TR/im/13#Insect#3#Bee#43892067695_82cf3e536b_o.jpg
83
+ DIS5K/DIS-TR/im/13#Insect#6#Centipede#20874281788_3e15c90a1c_o.jpg
84
+ DIS5K/DIS-TR/im/13#Insect#7#Dragonfly#14106671120_1b824d77e4_o.jpg
85
+ DIS5K/DIS-TR/im/13#Insect#8#Locust#21637491048_676ef7c9f7_o.jpg
86
+ DIS5K/DIS-TR/im/13#Insect#9#Mantis#1381120202_9dff6987b2_o.jpg
87
+ DIS5K/DIS-TR/im/14#Kitchenware#1#Cup#12812517473_327d6474b8_o.jpg
88
+ DIS5K/DIS-TR/im/14#Kitchenware#10#WineGlass#6402491641_389275d4d1_o.jpg
89
+ DIS5K/DIS-TR/im/14#Kitchenware#3#Hydrovalve#3129932040_8c05825004_o.jpg
90
+ DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#2881934780_87d5218ebb_o.jpg
91
+ DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205527.jpg
92
+ DIS5K/DIS-TR/im/14#Kitchenware#6#Spoon#32989113501_b69eccf0df_o.jpg
93
+ DIS5K/DIS-TR/im/14#Kitchenware#8#SweetStand#2867322189_c56d1e0b87_o.jpg
94
+ DIS5K/DIS-TR/im/15#Machine#1#Gear#19217846720_f5f2807475_o.jpg
95
+ DIS5K/DIS-TR/im/15#Machine#2#Machine#1620160659_9571b7a7ab_o.jpg
96
+ DIS5K/DIS-TR/im/16#Music Instrument#2#Harp#6012801603_1a6e2c16a6_o.jpg
97
+ DIS5K/DIS-TR/im/16#Music Instrument#5#Trombone#8683292118_d223c17ccb_o.jpg
98
+ DIS5K/DIS-TR/im/16#Music Instrument#6#Trumpet#8393262740_b8c216142c_o.jpg
99
+ DIS5K/DIS-TR/im/16#Music Instrument#8#Violin#1511267391_40e4949d68_o.jpg
100
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#1#BabyCarriage#6989512997_38b3dbc88b_o.jpg
101
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#14627183228_b2d68cf501_o.jpg
102
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#2932226475_1b2403e549_o.jpg
103
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#5420155648_86459905b8_o.jpg
104
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#2#Bicycle#IMG_20210513_134904.jpg
105
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#3#Carriage#3311962551_6f211b7bd6_o.jpg
106
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#4#Cart#2609732026_baf7fff3a1_o.jpg
107
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#5#Handcart#5821282211_201cefeaf2_o.jpg
108
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#7#Mower#5779003232_3bb3ae531a_o.jpg
109
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#10051622843_ace07e32b8_o.jpg
110
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#8075259294_f23e243849_o.jpg
111
+ DIS5K/DIS-TR/im/18#Plant#2#Tree#44800999741_e377e16dbb_o.jpg
112
+ DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#2631761913_3ac67d0223_o.jpg
113
+ DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#37707911566_e908a261b6_o.jpg
114
+ DIS5K/DIS-TR/im/2#Aircraft#3#HangGlider#2557220131_b8506920c5_o.jpg
115
+ DIS5K/DIS-TR/im/2#Aircraft#4#Helicopter#6215659280_5dbd9b4546_o.jpg
116
+ DIS5K/DIS-TR/im/2#Aircraft#6#Parachute#20185790493_e56fcaf8c6_o.jpg
117
+ DIS5K/DIS-TR/im/20#Sports#1#Archery#3871269982_ae4c59a7eb_o.jpg
118
+ DIS5K/DIS-TR/im/20#Sports#9#RockClimbing#9662433268_51299bc50e_o.jpg
119
+ DIS5K/DIS-TR/im/21#Tool#14#Sword#26258479365_2950d7fa37_o.jpg
120
+ DIS5K/DIS-TR/im/21#Tool#15#Tapeline#15505703447_e0fdeaa5a6_o.jpg
121
+ DIS5K/DIS-TR/im/21#Tool#4#Flag#26678602024_9b665742de_o.jpg
122
+ DIS5K/DIS-TR/im/21#Tool#4#Flag#5774823110_d603ce3cc8_o.jpg
123
+ DIS5K/DIS-TR/im/21#Tool#5#Hook#6867989814_dba18d673c_o.jpg
124
+ DIS5K/DIS-TR/im/22#Weapon#4#Rifle#4451713125_cd91719189_o.jpg
125
+ DIS5K/DIS-TR/im/3#Aquatic#2#Seadragon#4910944581_913139b238_o.jpg
126
+ DIS5K/DIS-TR/im/4#Architecture#12#Scaffold#3661448960_8aff24cc4d_o.jpg
127
+ DIS5K/DIS-TR/im/4#Architecture#13#Sculpture#6385318715_9a88d4eba7_o.jpg
128
+ DIS5K/DIS-TR/im/4#Architecture#17#Well#5011603479_75cf42808a_o.jpg
129
+ DIS5K/DIS-TR/im/5#Artifact#2#Cage#4892828841_7f1bc05682_o.jpg
130
+ DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#15404211628_9e9ff2ce2e_o.jpg
131
+ DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#3200169865_7c84cfcccf_o.jpg
132
+ DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#5859295071_c217e7c22f_o.jpg
133
+ DIS5K/DIS-TR/im/6#Automobile#10#SteeringWheel#17200338026_f1e2122d8e_o.jpg
134
+ DIS5K/DIS-TR/im/6#Automobile#3#Car#3780893425_1a7d275e09_o.jpg
135
+ DIS5K/DIS-TR/im/6#Automobile#5#Crane#15282506502_1b1132a7c3_o.jpg
136
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#16767791875_8e6df41752_o.jpg
137
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#3291433361_38747324c4_o.jpg
138
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#4195104238_12a754c61a_o.jpg
139
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#49645415132_61e5664ecf_o.jpg
140
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#IMG_20210521_232406.jpg
141
+ DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#3298312021_92f431e3e9_o.jpg
142
+ DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#47950134773_fbfff63f4e_o.jpg
143
+ DIS5K/DIS-TR/im/7#Electrical#11#VacuumCleaner#5448403677_6a29e21881_o.jpg
144
+ DIS5K/DIS-TR/im/7#Electrical#2#CeilingLamp#611568868_680ed5d39f_o.jpg
145
+ DIS5K/DIS-TR/im/7#Electrical#3#Fan#3391683115_990525a693_o.jpg
146
+ DIS5K/DIS-TR/im/7#Electrical#6#StreetLamp#150049122_0692266618_o.jpg
147
+ DIS5K/DIS-TR/im/7#Electrical#9#TransmissionTower#31433908671_7e7e277dfe_o.jpg
148
+ DIS5K/DIS-TR/im/8#Electronics#1#Antenna#8727884873_e0622ee5c4_o.jpg
149
+ DIS5K/DIS-TR/im/8#Electronics#2#Camcorder#4172690390_7e5f280ace_o.jpg
150
+ DIS5K/DIS-TR/im/8#Electronics#3#Earphone#413984555_f290febdf5_o.jpg
151
+ DIS5K/DIS-TR/im/8#Electronics#5#Headset#30574225373_3717ed9fa4_o.jpg
152
+ DIS5K/DIS-TR/im/8#Electronics#6#Microphone#538006482_4aae4f5bd6_o.jpg
153
+ DIS5K/DIS-TR/im/8#Electronics#9#MusicPlayer#1306012480_2ea80d2afd_o.jpg
154
+ DIS5K/DIS-TR/im/9#Entertainment#1#GymEquipment#33071754135_8f3195cbd1_o.jpg
155
+ DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#2305807849_be53d724ea_o.jpg
156
+ DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#3862040422_5bbf903204_o.jpg
157
+ DIS5K/DIS-TR/im/9#Entertainment#3#OutdoorFitnessEquipment#10814507005_3dacaa28b3_o.jpg
158
+ DIS5K/DIS-TR/im/9#Entertainment#4#FerrisWheel#81640293_4b0ee62040_o.jpg
159
+ DIS5K/DIS-TR/im/9#Entertainment#5#Swing#49867339188_08073f4b76_o.jpg
160
+ DIS5K/DIS-VD/im/1#Accessories#1#Bag#6815402415_e01c1a41e6_o.jpg
161
+ DIS5K/DIS-VD/im/1#Accessories#5#Jewelry#2744070193_1486582e8d_o.jpg
162
+ DIS5K/DIS-VD/im/10#Frame#1#BasketballHoop#IMG_20210521_232650.jpg
163
+ DIS5K/DIS-VD/im/10#Frame#5#Rack#6156611713_49ebf12b1e_o.jpg
164
+ DIS5K/DIS-VD/im/11#Furniture#11#Handrail#3276641240_1b84b5af85_o.jpg
165
+ DIS5K/DIS-VD/im/11#Furniture#13#Ladder#33423266_5391cf47e9_o.jpg
166
+ DIS5K/DIS-VD/im/11#Furniture#17#Table#3725111755_4fc101e7ab_o.jpg
167
+ DIS5K/DIS-VD/im/11#Furniture#2#Bench#35556410400_7235b58070_o.jpg
168
+ DIS5K/DIS-VD/im/11#Furniture#4#Chair#3301769985_e49de6739f_o.jpg
169
+ DIS5K/DIS-VD/im/11#Furniture#6#DentalChair#23811071619_2a95c3a688_o.jpg
170
+ DIS5K/DIS-VD/im/11#Furniture#9#Easel#8322807354_df6d56542e_o.jpg
171
+ DIS5K/DIS-VD/im/13#Insect#10#Mosquito#12391674863_0cdf430d3f_o.jpg
172
+ DIS5K/DIS-VD/im/13#Insect#7#Dragonfly#14693028899_344ea118f2_o.jpg
173
+ DIS5K/DIS-VD/im/14#Kitchenware#10#WineGlass#4450148455_8f460f541a_o.jpg
174
+ DIS5K/DIS-VD/im/14#Kitchenware#3#Hydrovalve#IMG_20210520_203410.jpg
175
+ DIS5K/DIS-VD/im/15#Machine#3#PlowHarrow#34521712846_df4babb024_o.jpg
176
+ DIS5K/DIS-VD/im/16#Music Instrument#5#Trombone#6222242743_e7189405cd_o.jpg
177
+ DIS5K/DIS-VD/im/17#Non-motor Vehicle#12#Wheel#25677578797_ea47e1d9e8_o.jpg
178
+ DIS5K/DIS-VD/im/17#Non-motor Vehicle#2#Bicycle#5153474856_21560b081b_o.jpg
179
+ DIS5K/DIS-VD/im/17#Non-motor Vehicle#7#Mower#16992510572_8a6ff27398_o.jpg
180
+ DIS5K/DIS-VD/im/19#Ship#2#Canoe#40571458163_7faf8b73d9_o.jpg
181
+ DIS5K/DIS-VD/im/2#Aircraft#1#Airplane#4270588164_66a619e834_o.jpg
182
+ DIS5K/DIS-VD/im/2#Aircraft#4#Helicopter#86789665_650b94b2ee_o.jpg
183
+ DIS5K/DIS-VD/im/20#Sports#14#Wakesurfing#5589577652_5061c168d2_o.jpg
184
+ DIS5K/DIS-VD/im/21#Tool#10#Spade#37018312543_63b21b0784_o.jpg
185
+ DIS5K/DIS-VD/im/21#Tool#14#Sword#24789047250_42df9bf422_o.jpg
186
+ DIS5K/DIS-VD/im/21#Tool#18#Umbrella#IMG_20210513_140445.jpg
187
+ DIS5K/DIS-VD/im/21#Tool#6#Key#43939732715_5a6e28b518_o.jpg
188
+ DIS5K/DIS-VD/im/22#Weapon#1#Cannon#12758066705_90b54295e7_o.jpg
189
+ DIS5K/DIS-VD/im/22#Weapon#4#Rifle#8019368790_fb6dc469a7_o.jpg
190
+ DIS5K/DIS-VD/im/3#Aquatic#5#Shrimp#2582833427_7a99e7356e_o.jpg
191
+ DIS5K/DIS-VD/im/4#Architecture#12#Scaffold#1013402687_590750354e_o.jpg
192
+ DIS5K/DIS-VD/im/4#Architecture#13#Sculpture#17176841759_272a3ed6e3_o.jpg
193
+ DIS5K/DIS-VD/im/4#Architecture#14#Stair#15079108505_0d11281624_o.jpg
194
+ DIS5K/DIS-VD/im/4#Architecture#19#Windmill#2928111082_ceb3051c04_o.jpg
195
+ DIS5K/DIS-VD/im/4#Architecture#3#Crack#3551574032_17dd106d31_o.jpg
196
+ DIS5K/DIS-VD/im/4#Architecture#5#GasStation#4564307581_c3069bdc62_o.jpg
197
+ DIS5K/DIS-VD/im/4#Architecture#8#ObservationTower#2704526950_d4f0ddc807_o.jpg
198
+ DIS5K/DIS-VD/im/5#Artifact#3#Handcraft#10873642323_1bafce3aa5_o.jpg
199
+ DIS5K/DIS-VD/im/6#Automobile#11#Tractor#8594504006_0c2c557d85_o.jpg
200
+ DIS5K/DIS-VD/im/8#Electronics#3#Earphone#8106454803_1178d867cc_o.jpg
tools/DepthPro/src/depth_pro/network/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ """Depth Pro network blocks."""
tools/DepthPro/src/depth_pro/network/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (197 Bytes). View file
 
tools/DepthPro/src/depth_pro/network/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (5.32 kB). View file
 
tools/DepthPro/src/depth_pro/network/__pycache__/encoder.cpython-310.pyc ADDED
Binary file (7.43 kB). View file
 
tools/DepthPro/src/depth_pro/network/__pycache__/fov.cpython-310.pyc ADDED
Binary file (2.09 kB). View file
 
tools/DepthPro/src/depth_pro/network/__pycache__/vit.cpython-310.pyc ADDED
Binary file (2.81 kB). View file
 
tools/DepthPro/src/depth_pro/network/__pycache__/vit_factory.cpython-310.pyc ADDED
Binary file (2.96 kB). View file
 
tools/DepthPro/src/depth_pro/network/decoder.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+
3
+ Dense Prediction Transformer Decoder architecture.
4
+
5
+ Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Iterable
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+
16
class MultiresConvDecoder(nn.Module):
    """Decoder for multi-resolution encodings.

    Every encoder level is first projected to ``dim_decoder`` channels and the
    levels are then fused one by one, starting at the last (lowest-resolution)
    level and finishing at level 0 (highest resolution).
    """

    def __init__(
        self,
        dims_encoder: Iterable[int],
        dim_decoder: int,
    ):
        """Initialize multiresolution convolutional decoder.

        Args:
        ----
            dims_encoder: Expected dims at each level from the encoder.
            dim_decoder: Dim of decoder features.

        """
        super().__init__()
        self.dims_encoder = list(dims_encoder)
        self.dim_decoder = dim_decoder
        self.dim_out = dim_decoder

        num_encoders = len(self.dims_encoder)

        # At the highest resolution, i.e. level 0, we apply projection w/ 1x1 convolution
        # when the dimensions mismatch. Otherwise we do not do anything, which is
        # the default behavior of monodepth.
        conv0 = (
            nn.Conv2d(self.dims_encoder[0], dim_decoder, kernel_size=1, bias=False)
            if self.dims_encoder[0] != dim_decoder
            else nn.Identity()
        )

        # All remaining levels are always projected with a 3x3 convolution.
        convs = [conv0]
        convs.extend(
            nn.Conv2d(
                self.dims_encoder[i],
                dim_decoder,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=False,
            )
            for i in range(1, num_encoders)
        )
        self.convs = nn.ModuleList(convs)

        # One fusion block per level; every level except 0 upsamples (deconv)
        # its output so it can be fused with the next higher resolution.
        self.fusions = nn.ModuleList(
            [
                FeatureFusionBlock2d(
                    num_features=dim_decoder,
                    deconv=(i != 0),
                    batch_norm=False,
                )
                for i in range(num_encoders)
            ]
        )

    def forward(
        self, encodings: list[torch.Tensor]
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Decode the multi-resolution encodings.

        Args:
        ----
            encodings: One feature map per encoder level, ordered like
                ``dims_encoder`` (level 0 first).

        Returns:
        -------
            A tuple ``(features, lowres_features)``: the fused output at level 0
            and the projected (not yet fused) features of the last level.

        Raises:
        ------
            ValueError: If the number of encodings differs from the number of
                configured encoder levels.

        """
        num_levels = len(encodings)
        num_encoders = len(self.dims_encoder)

        if num_levels != num_encoders:
            # Report the level count that is actually required.
            raise ValueError(
                f"Got encoder output levels={num_levels}, expected levels={num_encoders}."
            )

        # Project features of different encoder dims to the same decoder dim.
        # Fuse features from the lowest resolution (num_levels-1)
        # to the highest (0).
        features = self.convs[-1](encodings[-1])
        lowres_features = features
        features = self.fusions[-1](features)
        for i in range(num_levels - 2, -1, -1):
            features_i = self.convs[i](encodings[i])
            features = self.fusions[i](features, features_i)
        return features, lowres_features
94
+
95
+
96
class ResidualBlock(nn.Module):
    """Generic residual block.

    Follows He et al. - Identity Mappings in Deep Residual Networks (2016),
    https://arxiv.org/abs/1603.05027. The residual branch and the optional
    shortcut are supplied by the caller, so the block can be customized via
    factory functions.
    """

    def __init__(self, residual: nn.Module, shortcut: nn.Module | None = None) -> None:
        """Store the residual branch and the optional shortcut module."""
        super().__init__()
        self.residual = residual
        self.shortcut = shortcut

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``shortcut(x) + residual(x)``; the shortcut is the identity when unset."""
        # The residual branch is evaluated before the shortcut.
        branch = self.residual(x)
        identity = x if self.shortcut is None else self.shortcut(x)
        return identity + branch
119
+
120
+
121
class FeatureFusionBlock2d(nn.Module):
    """Feature fusion for DPT."""

    def __init__(
        self,
        num_features: int,
        deconv: bool = False,
        batch_norm: bool = False,
    ):
        """Initialize feature fusion block.

        Args:
        ----
            num_features: Input and output dimensions.
            deconv: Whether to use deconv before the final output conv.
            batch_norm: Whether to use batch normalization in resnet blocks.

        """
        super().__init__()

        # resnet1 refines the optional skip input, resnet2 the fused result.
        self.resnet1 = self._residual_block(num_features, batch_norm)
        self.resnet2 = self._residual_block(num_features, batch_norm)

        self.use_deconv = deconv
        if self.use_deconv:
            # Stride-2 transposed convolution with a 2x2 kernel.
            self.deconv = nn.ConvTranspose2d(
                num_features,
                num_features,
                kernel_size=2,
                stride=2,
                padding=0,
                bias=False,
            )

        self.out_conv = nn.Conv2d(
            in_channels=num_features,
            out_channels=num_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x0: torch.Tensor, x1: torch.Tensor | None = None) -> torch.Tensor:
        """Process and fuse input features.

        ``x1`` (when given) is passed through ``resnet1`` and added to ``x0``
        before the shared ``resnet2`` / optional deconv / output conv stack.
        """
        fused = x0
        if x1 is not None:
            fused = self.skip_add.add(fused, self.resnet1(x1))

        fused = self.resnet2(fused)
        if self.use_deconv:
            fused = self.deconv(fused)

        return self.out_conv(fused)

    @staticmethod
    def _residual_block(num_features: int, batch_norm: bool):
        """Create a residual block made of two ReLU -> 3x3 conv (-> BN) stages."""
        stages: list[nn.Module] = []
        for _ in range(2):
            stages.append(nn.ReLU(False))
            stages.append(
                nn.Conv2d(
                    num_features,
                    num_features,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    # The conv bias is dropped when batch norm follows.
                    bias=not batch_norm,
                )
            )
            if batch_norm:
                stages.append(nn.BatchNorm2d(num_features))

        return ResidualBlock(nn.Sequential(*stages))
tools/DepthPro/src/depth_pro/network/encoder.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ # DepthProEncoder combining patch and image encoders.
3
+
4
+ from __future__ import annotations
5
+
6
+ import math
7
+ from typing import Iterable, Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
class DepthProEncoder(nn.Module):
    """DepthPro Encoder.

    An encoder aimed at creating multi-resolution encodings from Vision Transformers.
    """

    def __init__(
        self,
        dims_encoder: Iterable[int],
        patch_encoder: nn.Module,
        image_encoder: nn.Module,
        hook_block_ids: Iterable[int],
        decoder_features: int,
    ):
        """Initialize DepthProEncoder.

        The framework
            1. creates an image pyramid,
            2. generates overlapping patches with a sliding window at each pyramid level,
            3. creates batched encodings via vision transformer backbones,
            4. produces multi-resolution encodings.

        Args:
        ----
            dims_encoder: Dimensions of the encoder at different layers.
            patch_encoder: Backbone used for patches.
            image_encoder: Backbone used for global image encoder.
            hook_block_ids: Hooks to obtain intermediate features for the patch encoder model.
            decoder_features: Number of feature output in the decoder.

        """
        super().__init__()

        self.dims_encoder = list(dims_encoder)
        self.patch_encoder = patch_encoder
        self.image_encoder = image_encoder
        self.hook_block_ids = list(hook_block_ids)

        patch_encoder_embed_dim = patch_encoder.embed_dim
        image_encoder_embed_dim = image_encoder.embed_dim

        # Side length of the backbone's token grid (image size // patch size).
        self.out_size = int(
            patch_encoder.patch_embed.img_size[0] // patch_encoder.patch_embed.patch_size[0]
        )

        def _create_project_upsample_block(
            dim_in: int,
            dim_out: int,
            upsample_layers: int,
            dim_int: Optional[int] = None,
        ) -> nn.Module:
            """Build a 1x1 projection followed by `upsample_layers` 2x deconvs."""
            if dim_int is None:
                dim_int = dim_out
            # Projection.
            blocks = [
                nn.Conv2d(
                    in_channels=dim_in,
                    out_channels=dim_int,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    bias=False,
                )
            ]

            # Upsampling.
            blocks += [
                nn.ConvTranspose2d(
                    in_channels=dim_int if i == 0 else dim_out,
                    out_channels=dim_out,
                    kernel_size=2,
                    stride=2,
                    padding=0,
                    bias=False,
                )
                for i in range(upsample_layers)
            ]

            return nn.Sequential(*blocks)

        self.upsample_latent0 = _create_project_upsample_block(
            dim_in=patch_encoder_embed_dim,
            dim_int=self.dims_encoder[0],
            dim_out=decoder_features,
            upsample_layers=3,
        )
        self.upsample_latent1 = _create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[0], upsample_layers=2
        )

        self.upsample0 = _create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[1], upsample_layers=1
        )
        self.upsample1 = _create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[2], upsample_layers=1
        )
        self.upsample2 = _create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[3], upsample_layers=1
        )

        self.upsample_lowres = nn.ConvTranspose2d(
            in_channels=image_encoder_embed_dim,
            out_channels=self.dims_encoder[3],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
        )
        self.fuse_lowres = nn.Conv2d(
            in_channels=(self.dims_encoder[3] + self.dims_encoder[3]),
            out_channels=self.dims_encoder[3],
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        # Obtain intermediate outputs of the blocks.
        self.patch_encoder.blocks[self.hook_block_ids[0]].register_forward_hook(
            self._hook0
        )
        self.patch_encoder.blocks[self.hook_block_ids[1]].register_forward_hook(
            self._hook1
        )

    def _hook0(self, model, input, output):
        """Forward hook: remember the output of the first hooked backbone block."""
        self.backbone_highres_hook0 = output

    def _hook1(self, model, input, output):
        """Forward hook: remember the output of the second hooked backbone block."""
        self.backbone_highres_hook1 = output

    @property
    def img_size(self) -> int:
        """Return the full image size of the network (4x the patch backbone size)."""
        return self.patch_encoder.patch_embed.img_size[0] * 4

    def _create_pyramid(
        self, x: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Create a 3-level image pyramid (full, half and quarter resolution)."""
        # Original resolution: 1536 by default.
        x0 = x

        # Middle resolution: 768 by default.
        x1 = F.interpolate(
            x, size=None, scale_factor=0.5, mode="bilinear", align_corners=False
        )

        # Low resolution: 384 by default, corresponding to the backbone resolution.
        x2 = F.interpolate(
            x, size=None, scale_factor=0.25, mode="bilinear", align_corners=False
        )

        return x0, x1, x2

    def split(self, x: torch.Tensor, overlap_ratio: float = 0.25) -> torch.Tensor:
        """Split the input into small patches with sliding window.

        Patches are 384x384 and are concatenated along the batch dimension in
        row-major order (rows outer, columns inner). The step count is derived
        from the last dimension only and reused for both axes.
        """
        patch_size = 384
        patch_stride = int(patch_size * (1 - overlap_ratio))

        image_size = x.shape[-1]
        steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1

        x_patch_list = []
        for j in range(steps):
            j0 = j * patch_stride
            j1 = j0 + patch_size

            for i in range(steps):
                i0 = i * patch_stride
                i1 = i0 + patch_size
                x_patch_list.append(x[..., j0:j1, i0:i1])

        return torch.cat(x_patch_list, dim=0)

    def merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor:
        """Merge the patched input into a image with sliding window.

        Args:
        ----
            x: Patches stacked along dim 0 in the order produced by `split`.
            batch_size: Number of images each patch position holds.
            padding: Number of rows/columns cropped from every patch border that
                touches a neighbouring patch. ``0`` keeps the patches intact.

        Returns:
        -------
            The patches stitched into one map per image.

        """
        steps = int(math.sqrt(x.shape[0] // batch_size))

        idx = 0

        output_list = []
        for j in range(steps):
            output_row_list = []
            for i in range(steps):
                output = x[batch_size * idx : batch_size * (idx + 1)]
                height, width = output.shape[-2:]

                # Crop only the sides that face another patch. Explicit end
                # indices are used because a `[:-padding]` slice would return an
                # empty tensor for padding == 0.
                top = padding if j != 0 else 0
                left = padding if i != 0 else 0
                bottom = max(height - padding, 0) if j != steps - 1 else height
                right = max(width - padding, 0) if i != steps - 1 else width

                output_row_list.append(output[..., top:bottom, left:right])
                idx += 1

            output_row = torch.cat(output_row_list, dim=-1)
            output_list.append(output_row)
        output = torch.cat(output_list, dim=-2)
        return output

    def reshape_feature(
        self, embeddings: torch.Tensor, width, height, cls_token_offset=1
    ):
        """Discard class token and reshape 1D feature map to a 2D grid."""
        b, _, c = embeddings.shape

        # Remove class token.
        if cls_token_offset > 0:
            embeddings = embeddings[:, cls_token_offset:, :]

        # Shape: (batch, height, width, dim) -> (batch, dim, height, width)
        embeddings = embeddings.reshape(b, height, width, c).permute(0, 3, 1, 2)
        return embeddings

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        """Encode input at multiple resolutions.

        Args:
        ----
            x (torch.Tensor): Input image.

        Returns:
        -------
            Multi resolution encoded features: the two hooked latent feature
            maps, the high- and mid-resolution patch features, and the fused
            low-resolution/global features (in that order).

        """
        batch_size = x.shape[0]

        # Step 0: create a 3-level image pyramid.
        x0, x1, x2 = self._create_pyramid(x)

        # Step 1: split to create batched overlapped mini-images at the backbone (BeiT/ViT/Dino)
        # resolution.
        # 5x5 @ 384x384 at the highest resolution (1536x1536).
        x0_patches = self.split(x0, overlap_ratio=0.25)
        # 3x3 @ 384x384 at the middle resolution (768x768).
        x1_patches = self.split(x1, overlap_ratio=0.5)
        # 1x1 @ 384x384 at the lowest resolution (384x384).
        x2_patches = x2

        # Concatenate all the sliding window patches and form a batch of size (35=5x5+3x3+1x1).
        x_pyramid_patches = torch.cat(
            (x0_patches, x1_patches, x2_patches),
            dim=0,
        )

        # Step 2: Run the backbone (BeiT) model and get the result of large batch size.
        x_pyramid_encodings = self.patch_encoder(x_pyramid_patches)
        x_pyramid_encodings = self.reshape_feature(
            x_pyramid_encodings, self.out_size, self.out_size
        )

        # Step 3: merging.
        # The hooks captured the whole pyramid batch; the high-resolution
        # patches come first, so keep only those (batch_size * 5 * 5 by default).
        num_highres_patches = len(x0_patches)

        # Merge highres latent encoding.
        x_latent0_encodings = self.reshape_feature(
            self.backbone_highres_hook0,
            self.out_size,
            self.out_size,
        )
        x_latent0_features = self.merge(
            x_latent0_encodings[:num_highres_patches], batch_size=batch_size, padding=3
        )

        x_latent1_encodings = self.reshape_feature(
            self.backbone_highres_hook1,
            self.out_size,
            self.out_size,
        )
        x_latent1_features = self.merge(
            x_latent1_encodings[:num_highres_patches], batch_size=batch_size, padding=3
        )

        # Split the 35 batch size from pyramid encoding back into 5x5+3x3+1x1.
        x0_encodings, x1_encodings, x2_encodings = torch.split(
            x_pyramid_encodings,
            [len(x0_patches), len(x1_patches), len(x2_patches)],
            dim=0,
        )

        # 96x96 feature maps by merging 5x5 @ 24x24 patches with overlaps.
        x0_features = self.merge(x0_encodings, batch_size=batch_size, padding=3)

        # 48x48 feature maps by merging 3x3 @ 24x24 patches with overlaps.
        x1_features = self.merge(x1_encodings, batch_size=batch_size, padding=6)

        # 24x24 feature maps.
        x2_features = x2_encodings

        # Apply the image encoder model.
        x_global_features = self.image_encoder(x2_patches)
        x_global_features = self.reshape_feature(
            x_global_features, self.out_size, self.out_size
        )

        # Upsample feature maps.
        x_latent0_features = self.upsample_latent0(x_latent0_features)
        x_latent1_features = self.upsample_latent1(x_latent1_features)

        x0_features = self.upsample0(x0_features)
        x1_features = self.upsample1(x1_features)
        x2_features = self.upsample2(x2_features)

        # Fuse the global image features with the low-resolution patch features.
        x_global_features = self.upsample_lowres(x_global_features)
        x_global_features = self.fuse_lowres(
            torch.cat((x2_features, x_global_features), dim=1)
        )

        return [
            x_latent0_features,
            x_latent1_features,
            x0_features,
            x1_features,
            x_global_features,
        ]
tools/DepthPro/src/depth_pro/network/fov.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ # Field of View network architecture.
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+
11
class FOVNetwork(nn.Module):
    """Field of View estimation network."""

    def __init__(
        self,
        num_features: int,
        fov_encoder: Optional[nn.Module] = None,
    ):
        """Initialize the Field of View estimation block.

        Args:
        ----
            num_features: Number of features used.
            fov_encoder: Optional encoder to bring additional network capacity.

        """
        super().__init__()

        def _halving_stage(channels_in: int, channels_out: int) -> list:
            # Stride-2 3x3 convolution followed by an in-place ReLU.
            return [
                nn.Conv2d(channels_in, channels_out, kernel_size=3, stride=2, padding=1),
                nn.ReLU(True),
            ]

        half = num_features // 2
        quarter = num_features // 4
        eighth = num_features // 8

        # Layers are created in the same order as they are applied.
        stem = _halving_stage(num_features, half)  # 128 x 24 x 24
        tail = [
            *_halving_stage(half, quarter),  # 64 x 12 x 12
            *_halving_stage(quarter, eighth),  # 32 x 6 x 6
            nn.Conv2d(eighth, 1, kernel_size=6, stride=1, padding=0),
        ]

        if fov_encoder is None:
            # Without an extra encoder the stem is simply part of the head.
            tail = stem + tail
        else:
            self.encoder = nn.Sequential(
                fov_encoder, nn.Linear(fov_encoder.embed_dim, half)
            )
            self.downsample = nn.Sequential(*stem)
        self.head = nn.Sequential(*tail)

    def forward(self, x: torch.Tensor, lowres_feature: torch.Tensor) -> torch.Tensor:
        """Forward the fov network.

        Args:
        ----
            x (torch.Tensor): Input image.
            lowres_feature (torch.Tensor): Low resolution feature.

        Returns:
        -------
            The field of view tensor.

        """
        if not hasattr(self, "encoder"):
            # No dedicated encoder: the head consumes the low-res feature directly.
            return self.head(lowres_feature)

        quarter_res = F.interpolate(
            x,
            size=None,
            scale_factor=0.25,
            mode="bilinear",
            align_corners=False,
        )
        # Drop the first token and move the feature dimension in front.
        tokens = self.encoder(quarter_res)[:, 1:].permute(0, 2, 1)
        lowres_feature = self.downsample(lowres_feature)
        return self.head(tokens.reshape_as(lowres_feature) + lowres_feature)