Spaces:

hpwang
/

VistaDream

Build error

App Files Files Community

hpwang committed on Nov 29, 2024

Commit

fd5e0f7

1 Parent(s): 04a2f88

[Init]

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +106 -0
ops/__init__.py +0 -0
ops/connect.py +113 -0
ops/depth_pro.py +13 -0
ops/eval.py +51 -0
ops/fooocus.py +6 -0
ops/gs/__init__.py +0 -0
ops/gs/basic.py +296 -0
ops/gs/sh_utils.py +96 -0
ops/gs/train.py +92 -0
ops/llava.py +31 -0
ops/mcs.py +121 -0
ops/sky.py +22 -0
ops/trajs/__init__.py +53 -0
ops/trajs/spiral.py +46 -0
ops/utils.py +381 -0
ops/visual_check.py +64 -0
pipe/__init__.py +0 -0
pipe/c2f_recons.py +211 -0
pipe/cfgs/INSTRUCT.md +18 -0
pipe/cfgs/__init__.py +8 -0
pipe/cfgs/basic.yaml +47 -0
pipe/lvm_inpaint.py +85 -0
pipe/reconstruct.py +52 -0
pipe/refine_mvdps.py +155 -0
requirements.txt +28 -0
tools/DepthPro/ACKNOWLEDGEMENTS.md +418 -0
tools/DepthPro/CODE_OF_CONDUCT.md +71 -0
tools/DepthPro/CONTRIBUTING.md +11 -0
tools/DepthPro/LICENSE +47 -0
tools/DepthPro/README.md +97 -0
tools/DepthPro/command_pro_dpt.py +54 -0
tools/DepthPro/get_pretrained_models.sh +8 -0
tools/DepthPro/pyproject.toml +59 -0
tools/DepthPro/src/depth_pro/__init__.py +5 -0
tools/DepthPro/src/depth_pro/cli/__init__.py +4 -0
tools/DepthPro/src/depth_pro/cli/run.py +154 -0
tools/DepthPro/src/depth_pro/depth_pro.py +298 -0
tools/DepthPro/src/depth_pro/eval/boundary_metrics.py +332 -0
tools/DepthPro/src/depth_pro/eval/dis5k_sample_list.txt +200 -0
tools/DepthPro/src/depth_pro/network/__init__.py +2 -0
tools/DepthPro/src/depth_pro/network/__pycache__/__init__.cpython-310.pyc +0 -0
tools/DepthPro/src/depth_pro/network/__pycache__/decoder.cpython-310.pyc +0 -0
tools/DepthPro/src/depth_pro/network/__pycache__/encoder.cpython-310.pyc +0 -0
tools/DepthPro/src/depth_pro/network/__pycache__/fov.cpython-310.pyc +0 -0
tools/DepthPro/src/depth_pro/network/__pycache__/vit.cpython-310.pyc +0 -0
tools/DepthPro/src/depth_pro/network/__pycache__/vit_factory.cpython-310.pyc +0 -0
tools/DepthPro/src/depth_pro/network/decoder.py +206 -0
tools/DepthPro/src/depth_pro/network/encoder.py +332 -0
tools/DepthPro/src/depth_pro/network/fov.py +82 -0

app.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import os
+import torch
+import gradio as gr
+from PIL import Image
+from pipe.cfgs import load_cfg
+from pipe.c2f_recons import Pipeline
+from ops.gs.basic import Gaussian_Scene
+from datetime import datetime
# Load the default configuration and build the shared pipeline once, at import time.
cfg = load_cfg('pipe/cfgs/basic.yaml')
vistadream = Pipeline(cfg)
from ops.visual_check import Check
# Module-level checker instance (the functions below go through vistadream.checkor instead).
checkor = Check()
def get_temp_path():
    """Create and return a fresh, timestamped output directory for one run.

    Returns:
        str: a path of the form ``data/gradio_temp/<YYYYmmdd_HHMMSS>/``. The
        trailing slash is kept because callers build file names by plain
        string concatenation.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"data/gradio_temp/{timestamp}/"
    # Create the per-run directory itself (not only its parent): the callers
    # write files into it right away and torch.save does not create folders.
    os.makedirs(output_path, exist_ok=True)
    return output_path
def scene_generate(rgb,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps):
    """Build a coarse scene from ``rgb``, refine it, and save it to a new run folder.

    The four numeric arguments are copied onto the global ``vistadream``
    pipeline as ``n_sample``, ``mcs_n_view``, ``mcs_rect_w`` and
    ``mcs_iterations`` respectively.

    Returns:
        The run directory (from ``get_temp_path``) that now holds ``scene.pth``.
    """
    pipeline = vistadream
    # start every request from an empty scene
    pipeline.scene = Gaussian_Scene(cfg)
    # trajectory generation settings
    pipeline.traj_type = 'spiral'
    pipeline.scene.traj_type = 'spiral'
    pipeline.n_sample = num_coarse_views
    # scene generation settings
    pipeline.opt_iters_per_frame = 512
    pipeline.outpaint_extend_times = 0.45
    pipeline.outpaint_selections = ['Left','Right','Top','Bottom']
    # scene refinement settings
    pipeline.mcs_n_view = num_mcs_views
    pipeline.mcs_rect_w = mcs_rect_w
    pipeline.mcs_iterations = mcs_steps
    # stage 1: coarse scene
    pipeline._coarse_scene(rgb)
    torch.cuda.empty_cache()
    # stage 2: refinement
    pipeline._MCS_Refinement()
    run_dir = get_temp_path()
    torch.cuda.empty_cache()
    scene_file = run_dir+'scene.pth'
    torch.save(pipeline.scene,scene_file)
    return run_dir
def render_video(output_path):
    """Render the current pipeline scene into ``output_path`` and return the two video paths.

    Returns:
        tuple: (``video_rgb.mp4`` path, ``video_dpt.mp4`` path), both built by
        appending the file name to ``output_path``.
    """
    save_dir = output_path+'.'
    vistadream.checkor._render_video(vistadream.scene,save_dir=save_dir)
    rgb_file = output_path+'video_rgb.mp4'
    dpt_file = output_path+'video_dpt.mp4'
    return rgb_file,dpt_file
def process(rgb,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps):
    """Gradio callback: generate the scene, then render it to the RGB and depth videos."""
    settings = (num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps)
    run_dir = scene_generate(rgb,*settings)
    videos = render_video(run_dir)
    return videos
# Gradio front-end: one input image plus advanced sliders on the left,
# the rendered RGB / depth videos and the example gallery on the right.
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("## VistaDream")
        gr.Markdown("### Sampling multiview consistent images for single-view scene reconstruction")
        gr.HTML("""
        <div style="display:flex;column-gap:4px;">
            <a href="https://github.com/WHU-USI3DV/VistaDream">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://vistadream-project-page.github.io/">
                <img src='https://img.shields.io/badge/Project-Page-green'>
            </a>
			<a href="https://arxiv.org/abs/2410.16892">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
        </div>
        """)
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil")
                run_button = gr.Button("Run")
                with gr.Accordion("Advanced options", open=False):
                    num_coarse_views = gr.Slider(label="Coarse-Expand", minimum=5, maximum=25, value=10, step=1)
                    num_mcs_views = gr.Slider(label="MCS Optimization Views", minimum=4, maximum=10, value=8, step=1)
                    mcs_rect_w = gr.Slider(label="MCS Rectification Weight", minimum=0.3, maximum=0.8, value=0.7, step=0.1)
                    mcs_steps = gr.Slider(label="MCS Steps", minimum=8, maximum=15, value=10, step=1)
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        # The caption must go through `label=`: the first positional
                        # argument of gr.Video is the initial *value* (a video path).
                        rgb_video = gr.Video(label="Output RGB renderings")
                    with gr.Column():
                        dpt_video = gr.Video(label="Output DPT renderings")
                # NOTE(review): every example entry is an empty placeholder path;
                # fill in real image / video files before relying on this gallery.
                examples = gr.Examples(
                    examples=[
                        ['',
                         '',
                         ''],
                        ['',
                         '',
                         ''],
                        ['',
                         '',
                         '']
                    ],
                    inputs=[input_image,rgb_video,dpt_video]
                )
    ips = [input_image,num_coarse_views,num_mcs_views,mcs_rect_w,mcs_steps]
    run_button.click(fn=process, inputs=ips, outputs=[rgb_video,dpt_video])
demo.launch(server_name='0.0.0.0')

ops/__init__.py ADDED Viewed

File without changes

ops/connect.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import cv2
+import numpy as np
+from copy import deepcopy
+from ops.utils import dpt2xyz,transform_points
class Connect_Tool():
    """Align an estimated depth map to a rendered one with a global scale and shift."""
    def __init__(self) -> None:
        pass
    def _align_scale_shift_numpy(self, pred: np.ndarray, target: np.ndarray):
        """Fit ``target ~= scale * pred + shift`` on the usable samples.

        Samples are used only where ``target > 0`` and ``pred < 199``.
        With 10 or fewer usable samples the identity ``(1, 0)`` is returned.
        If the least-squares fit yields a negative scale, it is replaced by
        the ratio of medians with zero shift.

        Returns:
            tuple: ``(scale, shift)``.
        """
        mask = (target > 0) & (pred < 199)
        if np.sum(mask) <= 10:
            return 1, 0
        target_mask = target[mask]
        pred_mask = pred[mask]
        scale, shift = np.polyfit(pred_mask, target_mask, deg=1)
        if scale < 0:
            # a flipped fit is meaningless for depth; fall back to a pure scale
            scale = np.median(target_mask) / (np.median(pred_mask) + 1e-8)
            shift = 0
        return scale, shift
    def __call__(self, render_dpt, inpaint_dpt, inpaint_msk):
        """Return ``inpaint_dpt`` rescaled to agree with ``render_dpt`` outside the mask.

        Args:
            render_dpt: reference depth map.
            inpaint_dpt: depth map to be aligned.
            inpaint_msk: mask of the inpainted area; values above 0.5 count
                as inpainted, so boolean and 0/1 float masks both work.

        Returns:
            ``render_dpt`` unchanged when nothing is masked, otherwise
            ``inpaint_dpt * scale + shift``.
        """
        # Threshold once and reuse it: `~` on a raw float mask would raise.
        inpaint_region = np.asarray(inpaint_msk) > 0.5
        if not inpaint_region.any(): return render_dpt
        keep_region = ~inpaint_region
        # the fit uses only the area both depth maps already share
        render_dpt_valid  = render_dpt[keep_region]
        inpaint_dpt_valid = inpaint_dpt[keep_region]
        scale,shift = self._align_scale_shift_numpy(inpaint_dpt_valid,render_dpt_valid)
        inpaint_dpt = inpaint_dpt*scale + shift
        return inpaint_dpt
class Smooth_Connect_Tool():
    """Connect an inpainted depth map to a rendered one: global alignment plus a smooth per-pixel correction."""
    def __init__(self) -> None:
        self.coarse_align = Connect_Tool()
    def _coarse_alignment(self, render_dpt, ipaint_dpt, ipaint_msk):
        """Apply the global scale/shift that coarsely aligns ``ipaint_dpt`` to ``render_dpt``."""
        inpaint_dpt = self.coarse_align(render_dpt,ipaint_dpt,ipaint_msk)
        return inpaint_dpt
    def _refine_movements(self, render_dpt, ipaint_dpt, ipaint_msk, n_iters=100, ksize=15):
        '''
        Follow https://arxiv.org/pdf/2311.13384

        Spread the depth residual known on the kept (non-inpainted) pixels
        into the inpainted area by repeatedly pinning it there and box-blurring.

        Args:
            render_dpt: reference depth map.
            ipaint_dpt: depth map to be corrected.
            ipaint_msk: inpaint mask, thresholded at 0.5.
            n_iters: number of pin-and-blur rounds (default 100).
            ksize: side length of the square blur kernel (default 15).

        Returns:
            ``ipaint_dpt`` plus the diffused correction field.
        '''
        ipaint_msk = ipaint_msk>.5
        keep_msk = ~ipaint_msk
        # residual on the area where both depth maps are available
        keep_adjust_dpt = render_dpt[keep_msk] - ipaint_dpt[keep_msk]
        complete_adjust = np.zeros_like(ipaint_dpt)
        for _ in range(n_iters):
            complete_adjust[keep_msk] = keep_adjust_dpt
            complete_adjust = cv2.blur(complete_adjust,(ksize,ksize))
        # The last blur output is used as-is: kept pixels are not re-pinned afterwards.
        ipaint_dpt = ipaint_dpt + complete_adjust
        return ipaint_dpt
    def _affine_dpt_to_GS(self, render_dpt, inpaint_dpt, inpaint_msk):
        """Coarse scale/shift alignment followed by the smooth refinement.

        Returns ``render_dpt`` unchanged when the mask selects no pixel.
        """
        if np.sum(inpaint_msk > 0.5) < 1.: return render_dpt
        inpaint_dpt = self._coarse_alignment(render_dpt,inpaint_dpt,inpaint_msk)
        inpaint_dpt = self._refine_movements(render_dpt,inpaint_dpt,inpaint_msk)
        return inpaint_dpt
    def _scale_dpt_to_GS(self, render_dpt, inpaint_dpt, inpaint_msk):
        """Smooth refinement only, without the coarse alignment step.

        Returns ``render_dpt`` unchanged when the mask selects no pixel.
        """
        if np.sum(inpaint_msk > 0.5) < 1.: return render_dpt
        inpaint_dpt = self._refine_movements(render_dpt,inpaint_dpt,inpaint_msk)
        return inpaint_dpt
class Occlusion_Removal():
    """Un-mark inpainted pixels of a new frame whose 3D points conflict with earlier frames."""
    def __init__(self) -> None:
        pass
    def __call__(self,scene,frame):
        """Clear entries of ``frame.inpaint`` (in place) and return ``frame``.

        Each inpainted pixel is lifted to 3D, projected into every frame of
        ``scene.frames``, and dropped when that frame's stored depth exceeds
        the point's depth by more than 1/15 of the mean of the two depths.
        """
        # lift the new frame to 3D and keep only the newly added (inpainted) pixels
        points = dpt2xyz(frame.dpt,frame.intrinsic)
        points = points[frame.inpaint]
        # bring them into the world coordinate system
        cam_to_world = np.linalg.inv(frame.extrinsic)
        points = transform_points(points,cam_to_world)
        # 1 = keep the pixel, 0 = rejected by at least one earlier frame
        keep_flag = np.ones_like(points[...,0])
        for old_frame in scene.frames:
            # points expressed in the earlier camera, then projected with its intrinsics
            points_cam = transform_points(deepcopy(points),old_frame.extrinsic)
            projected = np.einsum('ab,pb->pa',old_frame.intrinsic,points_cam)
            pix = projected[...,:2]/projected[...,-1:]
            depth = projected[...,-1]
            # only points that land inside the image, in front of the camera
            inside_u = (pix[...,0]>0) & (pix[...,0]<old_frame.W)
            inside_v = (pix[...,1]>0) & (pix[...,1]<old_frame.H)
            in_view = inside_u & inside_v & (depth>1e-2)
            in_view_idx = np.where(in_view)[0]
            pix = pix[in_view_idx].astype(np.uint32)
            depth = depth[in_view_idx]
            # compare against the depth the earlier frame stores at that pixel
            stored_depth = old_frame.dpt[pix[:,1],pix[:,0]]
            conflict = (stored_depth-depth)>(depth+stored_depth)/2./15.
            rejected_idx = in_view_idx[conflict]
            keep_flag[rejected_idx] = 0.
        # translate the per-point flags back to pixel coordinates of the mask
        rows,cols = np.where(frame.inpaint)
        rejected = keep_flag<.5
        frame.inpaint[rows[rejected],cols[rejected]] = False
        return frame

ops/depth_pro.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os,sys
# Make the bundled DepthPro wrapper importable; the folder is resolved
# against the current working directory of the process.
currect = os.getcwd()
reference = currect + '/tools/DepthPro'
sys.path.append(reference)
+from command_pro_dpt import apple_pro_depth
class Depth_Pro_Tool(apple_pro_depth):
    """Thin project-facing wrapper around ``apple_pro_depth``."""
    # NOTE(review): the default ``ckpt`` is an absolute path on one particular
    # machine; pass an explicit checkpoint path when running elsewhere.
    def __init__(self, device='cuda', ckpt='/mnt/proj/SOTAs/ml-depth-pro-main/checkpoints/depth_pro.pt'):
        super(Depth_Pro_Tool, self).__init__(device, ckpt)
    def __call__(self, image, f_px=None):
        """Forward ``image`` and the optional ``f_px`` to the parent implementation."""
        result = super(Depth_Pro_Tool, self).__call__(image, f_px)
        return result

ops/eval.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import cv2
+from PIL import Image
+import numpy as np
+from tqdm import tqdm
+from ops.llava import Llava
class llava_iqa():
    """Score a rendered video by asking LLaVA fixed yes/no quality questions.

    Up to 50 randomly chosen frames are shown to the model once per question;
    the score of a question is the fraction of frames answered with "yes".
    """
    def __init__(self) -> None:
        self._questions()
        self.llava = Llava(device='cuda')
    def _questions(self):
        """Define the question asked for each reported metric."""
        # quality, noise, structure, texture
        self.questions = {'noise-free':'Is the image free of noise or distortion',
        'sharp':'Does the image show clear objects and sharp edges',
        'structure':'Is the overall scene coherent and realistic in terms of layout and proportions in this image',
        'detail':'Does this image show detailed textures and materials',
        'quality':'Is this image overall a high quality image with clear objects, sharp edges, nice color, good overall structure, and good visual quailty'}
    def _load_renderings(self,video_fn):
        """Decode ``video_fn`` and return at most 50 randomly picked frames as RGB PIL images."""
        capturer = cv2.VideoCapture(video_fn)
        frames = []
        try:
            while True:
                ret,frame = capturer.read()
                if not ret or frame is None: break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame.astype(np.uint8)))
        finally:
            # always hand the decoder back, even if a frame conversion fails
            capturer.release()
        # random sample...
        idxs = np.random.permutation(len(frames))[0:50]
        return [frames[i] for i in idxs]
    @staticmethod
    def _parse_answer(reply):
        """Return 1 if the text after the last ``ASSISTANT:`` marker starts with "ye" (any case), else 0."""
        marker = 'ASSISTANT:'
        pos = reply.rfind(marker)
        # without a marker, judge the whole reply instead of slicing at a bogus offset
        answer = reply[pos+len(marker):] if pos >= 0 else reply
        return 1 if answer.strip().lower().startswith('ye') else 0
    def __call__(self,video_fn=f'data/vistadream/bust/video_rgb.mp4'):
        """Evaluate one video.

        Args:
            video_fn: path of the rendered RGB video.

        Returns:
            dict: question key -> mean of the 0/1 answers over the sampled
            frames (``nan`` when no frame could be read).
        """
        renderings = self._load_renderings(video_fn)
        results = {}
        for key,question in self.questions.items():
            query = f'<image>\n USER: {question}, just anwser with yes or no? \n ASSISTANT: '
            votes = [self._parse_answer(self.llava(rendering,query)) for rendering in renderings]
            results[key] = np.mean(np.array(votes)) if votes else float('nan')
        return results

ops/fooocus.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import os,sys
# Put the bundled Fooocus folder at the front of the import path; the folder
# is resolved against the current working directory of the process.
currect = os.getcwd()
reference = currect + '/tools/Fooocus'
sys.path.insert(0,reference)
+from fooocus_command import Fooocus

ops/gs/__init__.py ADDED Viewed

File without changes

ops/gs/basic.py ADDED Viewed

	@@ -0,0 +1,296 @@

+import PIL
+import torch
+import numpy as np
+import gsplat as gs
+import torch.nn as nn
+from copy import deepcopy
+import torch.nn.functional as F
+from dataclasses import dataclass
+from ops.utils import (
+    dpt2xyz,
+    alpha_inpaint_mask,
+    transform_points,
+    numpy_normalize,
+    numpy_quaternion_from_matrix
+)
# NOTE(review): the class declares no annotated fields, so the dataclass-generated
# __eq__ compares empty tuples and any two Frame objects compare equal.
@dataclass
class Frame():
    '''
    rgb: in shape of H*W*3, in range of 0-1
    dpt: in shape of H*W, real depth
    inpaint: bool mask in shape of H*W for inpainting
    intrinsic: 3*3
    extrinsic: array in shape of 4*4
    As a class for:
    initialize camera
    accept rendering result
    accept inpainting result
    All at 2D-domain
    '''
    def __init__(self,
                 H: int = None,
                 W: int = None,
                 rgb: np.ndarray = None,
                 dpt: np.ndarray = None,
                 sky: np.ndarray = None,
                 inpaint: np.ndarray = None,
                 intrinsic: np.ndarray = None,
                 extrinsic: np.ndarray = None,
                 # detailed target
                 ideal_dpt: np.ndarray = None,
                 ideal_nml: np.ndarray = None,
                 prompt: str = None) -> None:
        self.H = H
        self.W = W
        self.rgb = rgb
        self.dpt = dpt
        self.sky = sky
        self.prompt = prompt
        self.intrinsic = intrinsic
        self.extrinsic = extrinsic
        self._rgb_rect()
        self._extr_rect()
        # for inpainting (both names start out referring to the same mask object)
        self.inpaint = inpaint
        self.inpaint_wo_edge = inpaint
        # for supervision
        self.ideal_dpt = ideal_dpt
        self.ideal_nml = ideal_nml
    def _rgb_rect(self):
        """Normalise ``self.rgb`` to a numpy array in the 0-1 range (no-op when it is None)."""
        if self.rgb is None:
            return
        if not isinstance(self.rgb, np.ndarray):
            # Covers every PIL image class (not only PNG/JPEG files) and other
            # array-likes, without reaching into PIL plugin submodules.
            self.rgb = np.array(self.rgb)
        if np.amax(self.rgb) > 1.1:
            # values above ~1 are taken to be 0-255 and rescaled
            self.rgb = self.rgb / 255
    def _extr_rect(self):
        """Default the extrinsic to identity and cache its inverse."""
        if self.extrinsic is None: self.extrinsic = np.eye(4)
        self.inv_extrinsic = np.linalg.inv(self.extrinsic)
@dataclass
class Gaussian_Frame():
    '''
    In-frame-frustrum
    Gaussians from a single RGBD frame
    As a class for:
    accept information from initialized/inpainting+geo-estimated frame
    saving pixelsplat properties including rgb, xyz, scale, rotation, opacity
    The depth is projected to xyz in world coordinates; every property is kept
    in its de-activated (pre-activation) form and wrapped as nn.Parameter.
    '''
    # as pixelsplat gaussian
    # (no trailing commas here: a trailing comma would turn each default into a 1-tuple)
    rgb:       torch.Tensor = None
    scale:     torch.Tensor = None
    opacity:   torch.Tensor = None
    rotation:  torch.Tensor = None
    # gaussian center
    dpt:       torch.Tensor = None
    xyz:       torch.Tensor = None
    # as a frame
    H:         int = 480
    W:         int = 640
    def __init__(self, frame: 'Frame', device = 'cuda'):
        '''after inpainting'''
        # de-active functions (inverses of the activations applied at render time)
        # NOTE(review): logit maps exact 0 / 1 inputs to -inf / +inf - confirm inputs avoid the extremes
        self.rgbs_deact    = torch.logit
        self.scales_deact  = torch.log
        self.opacity_deact = torch.logit
        self.device =  device
        # for gaussian initialization
        self._set_property_from_frame(frame)
    def _to_3d(self):
        """Back-project ``self.dpt`` with the intrinsics and move the points through the inverse extrinsic."""
        xyz = dpt2xyz(self.dpt,self.intrinsic)
        inv_extrinsic = np.linalg.inv(self.extrinsic)
        xyz = transform_points(xyz,inv_extrinsic)
        return xyz
    def _paint_filter(self,paint_mask):
        """Keep only the pixels selected by ``paint_mask`` in every per-pixel property.

        When the mask selects fewer than 3 pixels, the first image row is used
        instead so that the frame never ends up empty.
        """
        if np.sum(paint_mask)<3:
            paint_mask = np.zeros((self.H,self.W))
            paint_mask[0:1] = 1
            paint_mask = paint_mask>.5
        self.rgb = self.rgb[paint_mask]
        self.xyz = self.xyz[paint_mask]
        self.scale = self.scale[paint_mask]
        self.opacity = self.opacity[paint_mask]
        self.rotation = self.rotation[paint_mask]
    def _to_cuda(self):
        """Convert the numpy properties to float32 tensors on ``self.device``."""
        self.rgb        = torch.from_numpy(self.rgb.astype(np.float32)).to(self.device)
        self.xyz        = torch.from_numpy(self.xyz.astype(np.float32)).to(self.device)
        self.scale      = torch.from_numpy(self.scale.astype(np.float32)).to(self.device)
        self.opacity    = torch.from_numpy(self.opacity.astype(np.float32)).to(self.device)
        self.rotation   = torch.from_numpy(self.rotation.astype(np.float32)).to(self.device)
    def _fine_init_scale_rotations(self):
        # from https://arxiv.org/pdf/2406.09394
        """ Compute rotation matrices that align z-axis with given normal vectors using matrix operations. """
        # NOTE(review): reads self.nml, which nothing in this class assigns - it must be set by the caller first.
        up_axis = np.array([0,1,0])
        nml = self.nml @ self.extrinsic[0:3,0:3]
        qz = numpy_normalize(nml)
        qx = np.cross(up_axis,qz)
        qx = numpy_normalize(qx)
        qy = np.cross(qz,qx)
        qy = numpy_normalize(qy)
        rot = np.concatenate([qx[...,None],qy[...,None],qz[...,None]],axis=-1)
        self.rotation = numpy_quaternion_from_matrix(rot)
        # scale: clamp the last normal component to at least 0.2 before normalising,
        # which bounds the 1/cos factors below
        safe_nml = deepcopy(self.nml)
        safe_nml[safe_nml[:,:,-1]<0.2,-1] = .2
        normal_xoz = deepcopy(safe_nml)
        normal_yoz = deepcopy(safe_nml)
        normal_xoz[...,1] = 0.
        normal_yoz[...,0] = 0.
        normal_xoz = numpy_normalize(normal_xoz)
        normal_yoz = numpy_normalize(normal_yoz)
        cos_theta_x = np.abs(normal_xoz[...,2])
        cos_theta_y = np.abs(normal_yoz[...,2])
        scale_basic = self.dpt / self.intrinsic[0,0] / np.sqrt(2)
        scale_x = scale_basic / cos_theta_x
        scale_y = scale_basic / cos_theta_y
        scale_z = (scale_x + scale_y) / 10.
        self.scale = np.concatenate([scale_x[...,None],
                                     scale_y[...,None],
                                     scale_z[...,None]],axis=-1)
    def _coarse_init_scale_rotations(self):
        """Isotropic initialisation: scale from depth and focal length, rotation (1,0,0,0) per pixel."""
        # gaussian property -- HW3 scale
        self.scale = self.dpt / self.intrinsic[0,0] / np.sqrt(2)
        self.scale = self.scale[:,:,None].repeat(3,-1)
        # gaussian property -- HW4 rotation
        self.rotation = np.zeros((self.H,self.W,4))
        self.rotation[:,:,0] = 1.
    def _set_property_from_frame(self,frame: 'Frame'):
        '''frame here is a complete init/inpainted frame'''
        # basic frame-level property
        self.H = frame.H
        self.W = frame.W
        self.dpt = frame.dpt
        self.intrinsic = frame.intrinsic
        self.extrinsic = frame.extrinsic
        # gaussian property -- xyz
        self.xyz = self._to_3d()
        # gaussian property -- HW3 rgb
        self.rgb = frame.rgb
        # gaussian property -- HW4 rotation HW3 scale
        self._coarse_init_scale_rotations()
        # gaussian property -- HW opacity
        self.opacity = np.ones((self.H,self.W,1)) * 0.8
        # select the pixels to keep, then move to the device
        self._paint_filter(frame.inpaint_wo_edge)
        self._to_cuda()
        # de-activate
        self.rgb = self.rgbs_deact(self.rgb)
        self.scale = self.scales_deact(self.scale)
        self.opacity = self.opacity_deact(self.opacity)
        # to torch parameters (frozen until _require_grad enables them)
        self.rgb = nn.Parameter(self.rgb,requires_grad=False)
        self.xyz = nn.Parameter(self.xyz,requires_grad=False)
        self.scale = nn.Parameter(self.scale,requires_grad=False)
        self.opacity = nn.Parameter(self.opacity,requires_grad=False)
        self.rotation = nn.Parameter(self.rotation,requires_grad=False)
    def _require_grad(self,sign=True):
        """Switch gradient tracking on or off for all five gaussian properties."""
        self.rgb = self.rgb.requires_grad_(sign)
        self.xyz = self.xyz.requires_grad_(sign)
        self.scale = self.scale.requires_grad_(sign)
        self.opacity = self.opacity.requires_grad_(sign)
        self.rotation = self.rotation.requires_grad_(sign)
class Gaussian_Scene():
    """Collection of per-frame gaussians plus the 2D frames that supervise them."""
    def __init__(self,cfg=None):
        """
        :cfg: optional config; when given, the trajectory settings are read from
              cfg.scene.traj (near_percentage, far_percentage, traj_forward_ratio,
              traj_backward_ratio), otherwise 5, 50, 0.3 and 0.4 are used
        """
        # 2D frames used to build the scene
        self.frames = []
        self.gaussian_frames: list[Gaussian_Frame] = [] # gaussian frame require training at this optimization
        # activate functions
        self.rgbs_act    = torch.sigmoid
        self.scales_act  = torch.exp
        self.opacity_act = torch.sigmoid
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # for traj generation
        self.traj_type   = 'spiral'
        if cfg is not None:
            self.traj_min_percentage = cfg.scene.traj.near_percentage
            self.traj_max_percentage = cfg.scene.traj.far_percentage
            self.traj_forward_ratio  = cfg.scene.traj.traj_forward_ratio
            self.traj_backward_ratio = cfg.scene.traj.traj_backward_ratio
        else:
            self.traj_min_percentage,self.traj_max_percentage,self.traj_forward_ratio,self.traj_backward_ratio = 5, 50, 0.3, 0.4
    # basic operations
    def _render_RGBD(self,frame,background_color='black'):
        '''
        Render all gaussian frames from the camera of `frame`.
        :frame: provides H, W, a 3*3 numpy intrinsic and a 4*4 numpy extrinsic
        :background_color: 'white' passes an explicit background, anything else passes None
        :out: render_rgb (first 3 channels), render_dpt (last channel), render_alpha
        '''
        background = None
        if background_color =='white':
            # NOTE(review): the "white" background is filled with 0.1, not 1.0 - confirm this is intended
            background = torch.ones(1,4,device=self.device)*0.1
            background[:,-1] = 0. # for depth
        # gather every gaussian of the scene into flat tensors
        xyz       = torch.cat([gf.xyz.reshape(-1,3) for gf in self.gaussian_frames],dim=0)
        rgb       = torch.cat([gf.rgb.reshape(-1,3) for gf in self.gaussian_frames],dim=0)
        scale     = torch.cat([gf.scale.reshape(-1,3) for gf in self.gaussian_frames],dim=0)
        opacity   = torch.cat([gf.opacity.reshape(-1) for gf in self.gaussian_frames],dim=0)
        rotation  = torch.cat([gf.rotation.reshape(-1,4) for gf in self.gaussian_frames],dim=0)
        # activate
        rgb       = self.rgbs_act(rgb)
        scale     = self.scales_act(scale)
        rotation  = F.normalize(rotation,dim=1)
        opacity   = self.opacity_act(opacity)
        # property
        H,W = frame.H, frame.W
        intrinsic = torch.from_numpy(frame.intrinsic.astype(np.float32)).to(self.device)
        extrinsic = torch.from_numpy(frame.extrinsic.astype(np.float32)).to(self.device)
        # render
        render_out,render_alpha,_ = gs.rendering.rasterization(means = xyz,
                                                scales    = scale,
                                                quats     = rotation,
                                                opacities = opacity,
                                                colors    = rgb,
                                                Ks        = intrinsic[None],
                                                viewmats  = extrinsic[None],
                                                width     = W,
                                                height    = H,
                                                packed    = False,
                                                near_plane= 0.01,
                                                render_mode="RGB+ED",
                                                backgrounds=background) # render: 1*H*W*(3+1)
        # drop only the leading camera axis; a bare squeeze() would also drop H or W when either is 1
        render_out  = render_out.squeeze(0) # result: H*W*(3+1)
        render_rgb  = render_out[:,:,0:3]
        render_dpt  = render_out[:,:,-1]
        return render_rgb, render_dpt, render_alpha
    @torch.no_grad()
    def _render_for_inpaint(self,frame):
        """Render the scene at the pose of `frame` and store rgb, depth and the inpaint mask on it.

        The frame is modified in place and also returned.
        """
        # first render
        render_rgb, render_dpt, render_alpha = self._render_RGBD(frame)
        render_msk = alpha_inpaint_mask(render_alpha)
        # assign back as numpy
        frame.rgb = render_rgb.detach().cpu().numpy()
        frame.dpt = render_dpt.detach().cpu().numpy()
        frame.inpaint = render_msk
        return frame
    def _add_trainable_frame(self,frame:'Frame',require_grad=True):
        """Register `frame` and its gaussians; `require_grad` decides whether they are optimised."""
        self.frames.append(frame)
        gf = Gaussian_Frame(frame, self.device)
        gf._require_grad(require_grad)
        self.gaussian_frames.append(gf)

ops/gs/sh_utils.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import torch
# Hard-coded real spherical-harmonics constants, one group per degree (0..4).
C0 = 0.28209479177387814
C1 = 0.4886025119029199
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396
]
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435
]
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]
def eval_sh(deg, sh, dirs):
    """
    Evaluate spherical harmonics at unit directions
    using hardcoded SH polynomials.
    Works with torch/np/jnp.
    ... Can be 0 or more batch dimensions.
    Args:
        deg: int SH deg. Degrees 0-4 are accepted
        sh: SH coeffs [..., C, (deg + 1) ** 2]
        dirs: unit directions [..., 3]
    Returns:
        [..., C]
    """
    assert 0 <= deg <= 4
    n_coeff = (deg + 1) ** 2
    assert sh.shape[-1] >= n_coeff

    def band(i):
        # i-th coefficient over all leading dimensions
        return sh[..., i]

    # degree 0: constant term
    result = C0 * band(0)
    if deg == 0:
        return result
    # degree 1: linear in the direction components
    x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
    result = (result -
            C1 * y * band(1) +
            C1 * z * band(2) -
            C1 * x * band(3))
    if deg == 1:
        return result
    # degree 2: quadratic terms
    xx, yy, zz = x * x, y * y, z * z
    xy, yz, xz = x * y, y * z, x * z
    result = (result +
            C2[0] * xy * band(4) +
            C2[1] * yz * band(5) +
            C2[2] * (2.0 * zz - xx - yy) * band(6) +
            C2[3] * xz * band(7) +
            C2[4] * (xx - yy) * band(8))
    if deg == 2:
        return result
    # degree 3: cubic terms
    result = (result +
            C3[0] * y * (3 * xx - yy) * band(9) +
            C3[1] * xy * z * band(10) +
            C3[2] * y * (4 * zz - xx - yy)* band(11) +
            C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * band(12) +
            C3[4] * x * (4 * zz - xx - yy) * band(13) +
            C3[5] * z * (xx - yy) * band(14) +
            C3[6] * x * (xx - 3 * yy) * band(15))
    if deg == 3:
        return result
    # degree 4: quartic terms
    result = (result + C4[0] * xy * (xx - yy) * band(16) +
            C4[1] * yz * (3 * xx - yy) * band(17) +
            C4[2] * xy * (7 * zz - 1) * band(18) +
            C4[3] * yz * (7 * zz - 3) * band(19) +
            C4[4] * (zz * (35 * zz - 30) + 3) * band(20) +
            C4[5] * xz * (7 * zz - 3) * band(21) +
            C4[6] * (xx - yy) * (7 * zz - 1) * band(22) +
            C4[7] * xz * (xx - 3 * yy) * band(23) +
            C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * band(24))
    return result
def RGB2SH(rgb):
    """Map a colour value to its degree-0 SH coefficient: subtract 0.5, then divide by C0."""
    centered = rgb - 0.5
    return centered / C0
def SH2RGB(sh):
    """Map a degree-0 SH coefficient back to a colour value: multiply by C0, then add 0.5."""
    scaled = sh * C0
    return scaled + 0.5

ops/gs/train.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import cv2
+import tqdm
+import torch
+# import lpips
+import numpy as np
+from ops import utils
+import torch.nn.functional as F
+import torchvision.transforms as tvtf
+from ops.gs.basic import Gaussian_Scene,Frame
+from torchmetrics.image import StructuralSimilarityIndexMeasure
class RGB_Loss():
    """Photometric loss: smooth-L1 on the selected pixels plus a weighted (1 - SSIM) on the full image."""
    def __init__(self,w_lpips=0.2,w_ssim=0.2,device=None):
        """
        Args:
            w_lpips: kept for interface compatibility; the LPIPS term is
                disabled, so this weight is stored but not used.
            w_ssim: weight of the (1 - SSIM) term.
            device: where the SSIM metric lives; defaults to 'cuda' when
                available and 'cpu' otherwise.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.rgb_loss = F.smooth_l1_loss
        self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0).to(device)
        self.w_ssim = w_ssim
        self.w_lpips = w_lpips
    def __call__(self,pr,gt,valid_mask=None):
        """Return ``smooth_l1(pr, gt) + w_ssim * (1 - SSIM(pr, gt))``.

        Args:
            pr: prediction, H*W or H*W*C (single-channel input is repeated to 3 channels).
            gt: target with the same layout as ``pr``.
            valid_mask: optional mask restricting the smooth-L1 term; the SSIM
                term is always computed on the whole image.
        """
        pr = torch.nan_to_num(pr)
        gt = torch.nan_to_num(gt)
        if len(pr.shape) < 3: pr = pr[:,:,None].repeat(1,1,3)
        if len(gt.shape) < 3: gt = gt[:,:,None].repeat(1,1,3)
        if valid_mask is not None:
            pr_valid = pr[valid_mask]
            gt_valid = gt[valid_mask]
        else:
            pr_valid = pr.reshape(-1,pr.shape[-1])
            gt_valid = gt.reshape(-1,gt.shape[-1])
        l_rgb = self.rgb_loss(pr_valid,gt_valid)
        # the metric expects N*C*H*W
        l_ssim = 1.0 - self.ssim(pr[None].permute(0, 3, 1, 2), gt[None].permute(0, 3, 1, 2))
        return l_rgb + self.w_ssim * l_ssim
class GS_Train_Tool():
    '''
    Frames and well-trained gaussians are kept, refine the trainable gaussians
    The supervision comes from the Frames of GS_Scene
    '''
    def __init__(self,
                 GS:Gaussian_Scene,
                 iters = 100) -> None:
        """
        Args:
            GS: scene whose gaussian frames with ``requires_grad`` set are optimised.
            iters: number of optimisation steps run by ``__call__``.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # hyperparameters for prune, densify, and update
        self.lr_factor = 1.00
        self.lr_update = 0.99
        # learning rate
        self.rgb_lr = 0.0005
        self.xyz_lr = 0.0001
        self.scale_lr = 0.005
        self.opacity_lr = 0.05
        self.rotation_lr = 0.001
        # GSs for training
        self.GS = GS
        # hyperparameters for training
        self.iters = iters
        self._init_optimizer()
        self.rgb_lossfunc = RGB_Loss(w_lpips=0)
    def _init_optimizer(self):
        """Create one Adam optimizer with a separate learning rate per property."""
        self.optimize_frames = [gf for gf in self.GS.gaussian_frames if gf.rgb.requires_grad]
        # following https://github.com/pointrix-project/msplat
        self.optimizer = torch.optim.Adam([
            {'params': [gf.xyz for gf in self.optimize_frames],      'lr': self.xyz_lr},
            {'params': [gf.rgb for gf in self.optimize_frames],      'lr': self.rgb_lr},
            {'params': [gf.scale for gf in self.optimize_frames],    'lr': self.scale_lr},
            {'params': [gf.opacity for gf in self.optimize_frames],  'lr': self.opacity_lr},
            {'params': [gf.rotation for gf in self.optimize_frames], 'lr': self.rotation_lr}
        ])
    def _render(self,frame):
        """Render rgb, depth and alpha of the scene at the pose of ``frame``."""
        rgb,dpt,alpha = self.GS._render_RGBD(frame)
        return rgb,dpt,alpha
    def _to_cuda(self,tensor):
        """Convert a numpy array to a float32 tensor on ``self.device``."""
        # use the device picked in __init__ instead of a literal 'cuda'
        tensor = torch.from_numpy(tensor.astype(np.float32)).to(self.device)
        return tensor
    def __call__(self,target_frames=None):
        """Run ``self.iters`` steps, each supervised by one randomly chosen frame.

        Args:
            target_frames: frames to supervise with; defaults to ``self.GS.frames``.

        Returns:
            The scene, with gradient tracking switched off on all gaussian frames.
        """
        target_frames = self.GS.frames if target_frames is None else target_frames
        for _ in tqdm.tqdm(range(self.iters)):
            frame_idx = np.random.randint(0,len(target_frames))
            frame :Frame = target_frames[frame_idx]
            render_rgb,render_dpt,render_alpha=self._render(frame)
            loss_rgb = self.rgb_lossfunc(render_rgb,self._to_cuda(frame.rgb),valid_mask=frame.inpaint)
            # optimization
            loss = loss_rgb
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
        refined_scene = self.GS
        # freeze everything so later tools do not pick these frames up again
        for gf in refined_scene.gaussian_frames:
            gf._require_grad(False)
        return refined_scene

ops/llava.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import PIL
+import torch
+import numpy as np
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+class Llava():
+    def __init__(self,device='cuda',
+                 llava_ckpt='llava-hf/bakLlava-v1-hf') -> None:
+        self.device = device
+        self.model_id = llava_ckpt
+        self.model = LlavaForConditionalGeneration.from_pretrained(
+            self.model_id,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            ).to(self.device)
+        self.processor = AutoProcessor.from_pretrained(self.model_id)
+    def __call__(self,image:PIL.Image, prompt=None):
+        # input check
+        if not isinstance(image,PIL.Image.Image):
+            if np.amax(image) < 1.1:
+                image = image * 255
+            image = image.astype(np.uint8)
+            image = PIL.Image.fromarray(image)
+        prompt = '<image>\n USER: Detaily imagine and describe the scene this image taken from? \n ASSISTANT: This image is taken from a scene of ' if prompt is None else prompt
+        inputs = self.processor(prompt, image, return_tensors='pt').to(self.model.device,torch.float16)
+        output = self.model.generate(**inputs, max_new_tokens=200, do_sample=False)
+        answer = self.processor.decode(output[0][2:], skip_special_tokens=True)
+        return answer

ops/mcs.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import torch
+import numpy as np
+import torchvision.transforms as tvtf
+from tools.StableDiffusion.Hack_SD_stepwise import Hack_SDPipe_Stepwise
+'''
+Input: Multiview images with added noise
+denoise to x0
+denoise from step t1 to step t2
+'''
+class HackSD_MCS():
+    '''
+        transform images to self.latents
+        add noise to self.latents
+        predict step noise --> x0
+        mv RGB-D warp as target image
+        target image encode to latent and get target noise
+        noise rectification
+        step denoise
+    '''
+    def __init__(self,device='cpu',use_lcm=True,denoise_steps=20,
+                 sd_ckpt=f'tools/StableDiffusion/ckpt',
+                 lcm_ckpt=f'latent-consistency/lcm-lora-sdv1-5') -> None:
+        '''
+        ref_rgb should be -1~1 tensor B*3*H*W
+        '''
+        self.device = device
+        self.target_type = np.float32
+        self.use_lcm = use_lcm
+        self.sd_ckpt = sd_ckpt
+        self.lcm_ckpt = lcm_ckpt
+        self._load_model()
+        # define step to add noise and steps to denoise
+        self.denoise_steps = denoise_steps
+        self.timesteps = self.model.timesteps
+    def _load_model(self):
+        self.model = Hack_SDPipe_Stepwise.from_pretrained(self.sd_ckpt)
+        self.model._use_lcm(self.use_lcm,self.lcm_ckpt)
+        self.model.re_init(num_inference_steps=50)
+        try:
+            self.model.enable_xformers_memory_efficient_attention()
+        except:
+            pass  # run without xformers
+        self.model = self.model.to(self.device)
+    def to(self, device):
+        self.device = device
+        self.model.to(device)
+    @ torch.no_grad()
+    def _add_noise_to_latent(self,latents):
+        bsz = latents.shape[0]
+        # in the Stable Diffusion, the iterations numbers is 1000 for adding the noise and denosing.
+        timestep = self.timesteps[-self.denoise_steps]
+        timestep = timestep.repeat(bsz).to(self.device)
+        # target noise
+        noise = torch.randn_like(latents)
+        # add noise
+        noisy_latent = self.model.scheduler.add_noise(latents, noise, timestep)
+        # -------------------- noise for supervision -----------------
+        if self.model.scheduler.config.prediction_type == "epsilon":
+            target = noise
+        elif self.model.scheduler.config.prediction_type == "v_prediction":
+            target = self.model.scheduler.get_velocity(latents, noise, timestep)
+        return noisy_latent, timestep, target
+    @ torch.no_grad()
+    def _encode_mv_init_images(self, images):
+        '''
+        images should be B3HW
+        '''
+        images = images * 2 - 1
+        self.latents = self.model._encode(images)
+        self.latents,_,_ = self._add_noise_to_latent(self.latents)
+    @ torch.no_grad()
+    def _sd_forward(self, denoise_step, prompt_latent:torch.Tensor):
+        # temp noise prediction
+        t = self.timesteps[[-self.denoise_steps+denoise_step]].to(self.device)
+        noise_pred = self.model._step_noise(self.latents, t, prompt_latent.repeat(len(self.latents),1,1))
+        # solve image
+        _,x0 = self.model._solve_x0(self.latents,noise_pred,t)
+        x0 = (x0 + 1) / 2 # in 0-1
+        return t, noise_pred, x0
+    @ torch.no_grad()
+    def _denoise_to_x0(self, timestep_in_1000, prompt_latent:torch.Tensor):
+        # temp noise prediction
+        noise_pred = self.model._step_noise(self.latents, timestep_in_1000, prompt_latent.repeat(len(self.latents),1,1))
+        # solve image
+        _,x0 = self.model._solve_x0(self.latents,noise_pred,timestep_in_1000)
+        x0 = (x0 + 1) / 2 # in 0-1
+        return noise_pred, x0
+    @ torch.no_grad()
+    def _step_denoise(self, t, pred_noise, rect_x0, rect_w = 0.7):
+        '''
+        pred_noise B4H//8W//8
+        x0, rect_x0 B3HW
+        '''
+        # encoder rect_x0 to latent
+        rect_x0 = rect_x0 * 2 - 1
+        rect_latent = self.model._encode(rect_x0)
+        # rectified noise
+        rect_noise = self.model._solve_noise_given_x0_latent(self.latents,rect_latent,t)
+        # noise rectification
+        rect_noise = rect_noise / rect_noise.std(dim=list(range(1, rect_noise.ndim)),keepdim=True) \
+                                * pred_noise.std(dim=list(range(1, pred_noise.ndim)),keepdim=True)
+        pred_noise = pred_noise*(1.-rect_w) + rect_noise*rect_w
+        # step forward
+        self.latents = self.model._step_denoise(self.latents,pred_noise,t)
+    @ torch.no_grad()
+    def _decode_mv_imgs(self):
+        imgs = self.model._decode(self.latents)
+        imgs = (imgs + 1) / 2
+        return imgs

ops/sky.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+import numpy as np
+from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation
+class Sky_Seg_Tool():
+    def __init__(self,cfg):
+        self.processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_ade20k_swin_large")
+        self.model = OneFormerForUniversalSegmentation.from_pretrained("shi-labs/oneformer_ade20k_swin_large")
+    def __call__(self, img):
+        '''
+        input rgb should be numpy in range of 0-1 or 0-255
+        '''
+        # Semantic Segmentation
+        if np.amax(img) < 2: img = img*255
+        inputs = self.processor(images=img, task_inputs=["semantic"], return_tensors="pt")
+        outputs = self.model(**inputs)
+        # pass through image_processor for postprocessing
+        predicted_semantic_map = self.processor.post_process_semantic_segmentation(outputs, target_sizes=[img.size[::-1]])[0]
+        sky_msk = predicted_semantic_map.numpy() == 2
+        return sky_msk

ops/trajs/__init__.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import numpy as np
+from ops.sky import Sky_Seg_Tool
+from ops.utils import dpt2xyz
+from .spiral import spiral_camera_poses
+class Trajectory_Generation():
+    def __init__(self,
+                 scene = None,
+                 method = 'spiral') -> None:
+        '''
+        method = 'spiral'/ rot' / 'spin'
+        '''
+        self._method = method
+        self.forward_ratio  = scene.traj_forward_ratio
+        self.backward_ratio = scene.traj_backward_ratio
+        self.min_percentage = scene.traj_min_percentage
+        self.max_percentage = scene.traj_max_percentage
+    def _radius(self, xyz):
+        # get range
+        _min = np.percentile(xyz,self.min_percentage,axis=0)
+        _max = np.percentile(xyz,self.max_percentage,axis=0)
+        _range = _max - _min
+        # set radius to mean range of three axes
+        self.radius = np.mean(_range)
+    def _traj_spiral(self, nframe):
+        trajs = spiral_camera_poses(nframe, self.radius, self.forward_ratio, self.backward_ratio)
+        return trajs
+    def __call__(self, nframe, xyz):
+        if xyz.ndim > 2:
+            xyz = xyz.reshape(-1,3)
+        self._radius(xyz)
+        if self._method == 'rot':
+            trajs = self._traj_rot(nframe)
+        elif self._method == 'spin':
+            trajs = self._traj_spin(nframe)
+        elif self._method == 'spiral':
+            trajs = self._traj_spiral(nframe)
+        else:
+            raise TypeError('method = rot / spiral')
+        return trajs
+def _generate_trajectory(cfg, scene, nframes=None):
+    method = scene.traj_type
+    nframe = cfg.scene.traj.n_sample*6 if nframes is None else nframes
+    sky,dpt,intrinsic = scene.frames[0].sky,scene.frames[0].dpt,scene.frames[0].intrinsic
+    xyz = dpt2xyz(dpt,intrinsic)
+    init_xyz = xyz[~sky]
+    generator = Trajectory_Generation(scene=scene,method=method)
+    traj = generator(nframe,init_xyz)
+    return traj

ops/trajs/spiral.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import torch
+import numpy as np
+def generate_spiral_trajectory(num_frames, radius, forward_ratio=0.2, backward_ratio=0.8):
+    t = np.linspace(0, 1, num_frames)
+    r = np.sin(2 * np.pi * t) * radius
+    # rotation angles at each frame
+    theta = 2 * np.pi * t * num_frames
+    # try not to change y (up-down for floor and sky)
+    x = r * np.cos(theta)
+    y = r * np.sin(theta) * 0.3
+    z = -r
+    z[z<0]*=forward_ratio
+    z[z>0]*=backward_ratio
+    return x, y, z
+def look_at(camera_position, target_position):
+    # look at direction
+    direction = target_position - camera_position
+    direction /= np.linalg.norm(direction)
+    # calculate rotation matrix
+    up = np.array([0, 1, 0])
+    right = np.cross(up, direction)
+    right /= np.linalg.norm(right)
+    up = np.cross(direction, right)
+    rotation_matrix = np.vstack([right, up, direction])
+    rotation_matrix = np.linalg.inv(rotation_matrix)
+    return rotation_matrix
+def spiral_camera_poses(num_frames, radius, forward_ratio = 0.2, backward_ratio = 0.8, rotation_times = 0.3, look_at_times = 0.5):
+    x, y, z = generate_spiral_trajectory(num_frames, radius*rotation_times, forward_ratio, backward_ratio)
+    target_position = np.array([0,0,radius*look_at_times])
+    camera_positions = np.vstack([x, y, z]).T
+    camera_poses = []
+    for pos in camera_positions:
+        rotation_matrix = look_at(pos, target_position)
+        transform_matrix = np.eye(4)
+        transform_matrix[:3, :3] = rotation_matrix
+        transform_matrix[:3,  3] = pos
+        camera_poses.append(transform_matrix[None])
+    camera_poses.reverse()
+    camera_poses = np.concatenate(camera_poses,axis=0)
+    return camera_poses

ops/utils.py ADDED Viewed

	@@ -0,0 +1,381 @@

+import os
+import cv2
+import torch
+import matplotlib
+import numpy as np
+import open3d as o3d
+from PIL import Image
+from copy import deepcopy
+from omegaconf import OmegaConf
+from scipy.spatial import cKDTree
+def gen_config(cfg_path):
+    return OmegaConf.load(cfg_path)
+def get_focal_from_fov(new_fov, H, W):
+    # NOTE: top-left pixel should be (0,0)
+    if W >= H:
+        f = (W / 2.0) / np.tan(np.deg2rad(new_fov / 2.0))
+    else:
+        f = (H / 2.0) / np.tan(np.deg2rad(new_fov / 2.0))
+    return f
+def get_intrins_from_fov(new_fov, H, W):
+    # NOTE: top-left pixel should be (0,0)
+    f = get_focal_from_fov(new_fov,H,W)
+    new_cu = (W / 2.0) - 0.5
+    new_cv = (H / 2.0) - 0.5
+    new_intrins = np.array([
+        [f,         0,     new_cu  ],
+        [0,         f,     new_cv  ],
+        [0,         0,     1       ]
+    ])
+    return new_intrins
+def dpt2xyz(dpt,intrinsic):
+    # get grid
+    height, width = dpt.shape[0:2]
+    grid_u = np.arange(width)[None,:].repeat(height,axis=0)
+    grid_v = np.arange(height)[:,None].repeat(width,axis=1)
+    grid = np.concatenate([grid_u[:,:,None],grid_v[:,:,None],np.ones_like(grid_v)[:,:,None]],axis=-1)
+    uvz = grid * dpt[:,:,None]
+    # inv intrinsic
+    inv_intrinsic = np.linalg.inv(intrinsic)
+    xyz = np.einsum(f'ab,hwb->hwa',inv_intrinsic,uvz)
+    return xyz
+def dpt2xyz_torch(dpt,intrinsic):
+    # get grid
+    height, width = dpt.shape[0:2]
+    grid_u = torch.arange(width)[None,:].repeat(height,1)
+    grid_v = torch.arange(height)[:,None].repeat(1,width)
+    grid = torch.concatenate([grid_u[:,:,None],grid_v[:,:,None],torch.ones_like(grid_v)[:,:,None]],axis=-1).to(dpt)
+    uvz = grid * dpt[:,:,None]
+    # inv intrinsic
+    inv_intrinsic = torch.linalg.inv(intrinsic)
+    xyz = torch.einsum(f'ab,hwb->hwa',inv_intrinsic,uvz)
+    return xyz
+def visual_pcd(xyz, color=None, normal = True):
+    if hasattr(xyz,'ndim'):
+        xyz_norm = np.mean(np.sqrt(np.sum(np.square(xyz),axis=1)))
+        xyz = xyz / xyz_norm
+        xyz = xyz.reshape(-1,3)
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(xyz)
+    else: pcd = xyz
+    if color is not None:
+        color = color.reshape(-1,3)
+        pcd.colors = o3d.utility.Vector3dVector(color)
+    if normal:
+        pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(0.2, 20))
+    o3d.visualization.draw_geometries([pcd])
+def visual_pcds(xyzs, normal = True):
+    pcds = []
+    for xyz in xyzs:
+        if hasattr(xyz,'ndim'):
+            # xyz_norm = np.mean(np.sqrt(np.sum(np.square(xyz),axis=1)))
+            # xyz = xyz / xyz_norm
+            xyz = xyz.reshape(-1,3)
+            pcd = o3d.geometry.PointCloud()
+            pcd.points = o3d.utility.Vector3dVector(xyz)
+            pcd.paint_uniform_color(np.random.rand(3))
+        else: pcd = xyz
+        if normal:
+            pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(0.2, 20))
+        pcds.append(pcd)
+    o3d.visualization.draw_geometries(pcds)
+def save_pic(input_pic:np.array,save_fn,normalize=True):
+    # avoid replace
+    pic = deepcopy(input_pic).astype(np.float32)
+    pic = np.nan_to_num(pic)
+    if normalize:
+        vmin = np.percentile(pic, 2)
+        vmax = np.percentile(pic, 98)
+        pic = (pic - vmin) / (vmax - vmin)
+    pic = (pic * 255.0).clip(0, 255)
+    if save_fn is not None:
+        pic_save = Image.fromarray(pic.astype(np.uint8))
+        pic_save.save(save_fn)
+    return pic
+def depth_colorize(dpt,sky_mask=None):
+    cm = matplotlib.colormaps["Spectral"]
+    depth = dpt_normalize(dpt,sky_mask)
+    img_colored_np = cm(depth, bytes=False)[:, :, 0:3]  # value from 0 to 1
+    return img_colored_np
+def dpt_normalize(dpt, sky_mask = None):
+    if sky_mask is not None:
+        pic = dpt[~sky_mask]
+    else:
+        pic = dpt
+    vmin = np.percentile(pic, 2)
+    vmax = np.percentile(pic, 98)
+    dpt = (deepcopy(dpt) - vmin) / (vmax - vmin)
+    if sky_mask is not None:
+        dpt[sky_mask] = 1.
+    return dpt
+def transform_points(pts,transform):
+    h,w=transform.shape
+    if h==3 and w==3:
+        return pts @ transform.T
+    if h==3 and w==4:
+        return pts @ transform[:,:3].T + transform[:,3:].T
+    elif h==4 and w==4:
+        return pts @ transform[0:3,:3].T + transform[0:3,3:].T
+    else: raise NotImplementedError
+def get_nml_from_quant(quant):
+    '''
+    input N*4
+    outut N*3
+    follow https://arxiv.org/pdf/2404.17774
+    '''
+    w=quant[:,0]
+    x=quant[:,1]
+    y=quant[:,2]
+    z=quant[:,3]
+    n0 = 2*x*z+2*y*w
+    n1 = 2*y*z-2*x*w
+    n2 = 1-2*x*x-2*y*y
+    nml = torch.cat((n0[:,None],n1[:,None],n2[:,None]),dim=1)
+    return nml
+def quaternion_from_matrix(M):
+    m00 = M[..., 0, 0]
+    m01 = M[..., 0, 1]
+    m02 = M[..., 0, 2]
+    m10 = M[..., 1, 0]
+    m11 = M[..., 1, 1]
+    m12 = M[..., 1, 2]
+    m20 = M[..., 2, 0]
+    m21 = M[..., 2, 1]
+    m22 = M[..., 2, 2]
+    K = torch.zeros((len(M),4,4)).to(M)
+    K[:,0,0] = m00 - m11 - m22
+    K[:,1,0] = m01 + m10
+    K[:,1,1] = m11 - m00 - m22
+    K[:,2,0] = m02 + m20
+    K[:,2,1] = m12 + m21
+    K[:,2,2] = m22 - m00 - m11
+    K[:,3,0] = m21 - m12
+    K[:,3,1] = m02 - m20
+    K[:,3,2] = m10 - m01
+    K[:,3,3] = m00 + m11 + m22
+    K = K/3
+    # quaternion is eigenvector of K that corresponds to largest eigenvalue
+    w, V = torch.linalg.eigh(K)
+    q = V[torch.arange(len(V)),:,torch.argmax(w,dim=1)]
+    q = q[:,[3, 0, 1, 2]]
+    for i in range(len(q)):
+        if q[i,0]<0.:
+            q[i] = -q[i]
+    return q
+def numpy_quaternion_from_matrix(M):
+    H,W = M.shape[0:2]
+    M = M.reshape(-1,3,3)
+    m00 = M[..., 0, 0]
+    m01 = M[..., 0, 1]
+    m02 = M[..., 0, 2]
+    m10 = M[..., 1, 0]
+    m11 = M[..., 1, 1]
+    m12 = M[..., 1, 2]
+    m20 = M[..., 2, 0]
+    m21 = M[..., 2, 1]
+    m22 = M[..., 2, 2]
+    K = np.zeros((len(M),4,4))
+    K[...,0,0] = m00 - m11 - m22
+    K[...,1,0] = m01 + m10
+    K[...,1,1] = m11 - m00 - m22
+    K[...,2,0] = m02 + m20
+    K[...,2,1] = m12 + m21
+    K[...,2,2] = m22 - m00 - m11
+    K[...,3,0] = m21 - m12
+    K[...,3,1] = m02 - m20
+    K[...,3,2] = m10 - m01
+    K[...,3,3] = m00 + m11 + m22
+    K = K/3
+    # quaternion is eigenvector of K that corresponds to largest eigenvalue
+    w, V = np.linalg.eigh(K)
+    q = V[np.arange(len(V)),:,np.argmax(w,axis=1)]
+    q = q[...,[3, 0, 1, 2]]
+    for i in range(len(q)):
+        if q[i,0]<0.:
+            q[i] = -q[i]
+    q = q.reshape(H,W,4)
+    return q
+def numpy_normalize(input):
+    input = input / (np.sqrt(np.sum(np.square(input),axis=-1,keepdims=True))+1e-5)
+    return input
+class suppress_stdout_stderr(object):
+    '''
+    Avoid terminal output of diffusion processings!
+    A context manager for doing a "deep suppression" of stdout and stderr in
+    Python, i.e. will suppress all print, even if the print originates in a
+    compiled C/Fortran sub-function.
+       This will not suppress raised exceptions, since exceptions are printed
+    to stderr just before a script exits, and after the context manager has
+    exited (at least, I think that is why it lets exceptions through).
+    '''
+    def __init__(self):
+        # Open a pair of null files
+        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
+        # Save the actual stdout (1) and stderr (2) file descriptors.
+        self.save_fds = (os.dup(1), os.dup(2))
+    def __enter__(self):
+        # Assign the null pointers to stdout and stderr.
+        os.dup2(self.null_fds[0], 1)
+        os.dup2(self.null_fds[1], 2)
+    def __exit__(self, *_):
+        # Re-assign the real stdout/stderr back to (1) and (2)
+        os.dup2(self.save_fds[0], 1)
+        os.dup2(self.save_fds[1], 2)
+        # Close the null files
+        os.close(self.null_fds[0])
+        os.close(self.null_fds[1])
+import torch.nn.functional as F
+def nei_delta(input,pad=2):
+    if not type(input) is torch.Tensor:
+        input = torch.from_numpy(input.astype(np.float32))
+    if len(input.shape) < 3:
+        input = input[:,:,None]
+    h,w,c = input.shape
+    # reshape
+    input = input.permute(2,0,1)[None]
+    input = F.pad(input, pad=(pad,pad,pad,pad), mode='replicate')
+    kernel = 2*pad + 1
+    input = F.unfold(input,[kernel,kernel],padding=0)
+    input = input.reshape(c,-1,h,w).permute(2,3,0,1).squeeze() # hw(3)*25
+    return torch.amax(input,dim=-1),torch.amin(input,dim=-1),input
+def inpaint_mask(render_dpt,render_rgb):
+    # edge filter delta thres
+    valid_dpt = render_dpt[render_dpt>1e-3]
+    valid_dpt = torch.sort(valid_dpt).values
+    max = valid_dpt[int(.85*len(valid_dpt))]
+    min = valid_dpt[int(.15*len(valid_dpt))]
+    ths = (max-min) * 0.2
+    # nei check
+    nei_max, nei_min, _ = nei_delta(render_dpt,pad=1)
+    edge_mask = (nei_max - nei_min) > ths
+    # render hole
+    hole_mask = render_dpt < 1e-3
+    # whole mask -- original noise and sparse
+    mask = edge_mask | hole_mask
+    mask = mask.cpu().float().numpy()
+    # modify rgb sightly for small holes : blur and sharpen
+    render_rgb       = render_rgb.detach().cpu().numpy()
+    render_rgb       = (render_rgb*255).astype(np.uint8)
+    render_rgb_blur  = cv2.medianBlur(render_rgb,5)
+    render_rgb[mask>.5] = render_rgb_blur[mask>.5]  # blur and replace small holes
+    render_rgb       = torch.from_numpy((render_rgb/255).astype(np.float32)).to(render_dpt)
+    # slightly clean mask
+    kernel = np.ones((5,5),np.uint8)
+    mask = cv2.erode(mask,kernel,iterations=2)
+    mask = cv2.dilate(mask,kernel,iterations=7)
+    mask = mask > 0.5
+    return mask,render_rgb
+def alpha_inpaint_mask(render_alpha):
+    render_alpha = render_alpha.detach().squeeze().cpu().numpy()
+    paint_mask = 1.-np.around(render_alpha)
+    # slightly clean mask
+    kernel = np.ones((5,5),np.uint8)
+    paint_mask = cv2.erode(paint_mask,kernel,iterations=1)
+    paint_mask = cv2.dilate(paint_mask,kernel,iterations=3)
+    paint_mask = paint_mask > 0.5
+    return paint_mask
+def edge_filter(metric_dpt,sky=None,times=0.1):
+    sky = np.zeros_like(metric_dpt,bool) if sky is None else sky
+    _max = np.percentile(metric_dpt[~sky],95)
+    _min = np.percentile(metric_dpt[~sky], 5)
+    _range = _max - _min
+    nei_max,nei_min,_ = nei_delta(metric_dpt)
+    delta = (nei_max-nei_min).numpy()
+    edge = delta > times*_range
+    return edge
+def fill_mask_with_nearest(imgs, mask):
+    # mask and un-mask pixel coors
+    mask_coords = np.column_stack(np.where(mask > .5))
+    non_mask_coords = np.column_stack(np.where(mask < .5))
+    # kd-tree on un-masked pixels
+    tree = cKDTree(non_mask_coords)
+    # nn search of masked pixels
+    _, idxs = tree.query(mask_coords)
+    # replace and fill
+    for i, coord in enumerate(mask_coords):
+        nearest_coord = non_mask_coords[idxs[i]]
+        for img in imgs:
+            img[coord[0], coord[1]] = img[nearest_coord[0], nearest_coord[1]]
+    return imgs
+def edge_rectify(metric_dpt,rgb,sky=None):
+    edge = edge_filter(metric_dpt,sky)
+    process_rgb = deepcopy(rgb)
+    metric_dpt,process_rgb = fill_mask_with_nearest([metric_dpt,process_rgb],edge)
+    return metric_dpt,process_rgb
+from plyfile import PlyData, PlyElement
+def color2feat(color):
+    max_sh_degree = 3
+    fused_color = (color-0.5)/0.28209479177387814
+    features = np.zeros((fused_color.shape[0], 3, (max_sh_degree + 1) ** 2))
+    features = torch.from_numpy(features.astype(np.float32))
+    features[:, :3, 0 ] = fused_color
+    features[:, 3:, 1:] = 0.0
+    features_dc   = features[:,:,0:1]
+    features_rest = features[:,:,1: ]
+    return features_dc,features_rest
+def construct_list_of_attributes(features_dc,features_rest,scale,rotation):
+    l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
+    # All channels except the 3 DC
+    for i in range(features_dc.shape[1]*features_dc.shape[2]):
+        l.append('f_dc_{}'.format(i))
+    for i in range(features_rest.shape[1]*features_rest.shape[2]):
+        l.append('f_rest_{}'.format(i))
+    l.append('opacity')
+    for i in range(scale.shape[1]):
+        l.append('scale_{}'.format(i))
+    for i in range(rotation.shape[1]):
+        l.append('rot_{}'.format(i))
+    return l
+def save_ply(scene,path):
+    xyz       = torch.cat([gf.xyz.reshape(-1,3) for gf in scene.gaussian_frames],dim=0).detach().cpu().numpy()
+    scale     = torch.cat([gf.scale.reshape(-1,3) for gf in scene.gaussian_frames],dim=0).detach().cpu().numpy()
+    opacities = torch.cat([gf.opacity.reshape(-1) for gf in scene.gaussian_frames],dim=0)[:,None].detach().cpu().numpy()
+    rotation  = torch.cat([gf.rotation.reshape(-1,4) for gf in scene.gaussian_frames],dim=0).detach().cpu().numpy()
+    rgb       = torch.sigmoid(torch.cat([gf.rgb.reshape(-1,3) for gf in scene.gaussian_frames],dim=0))
+    # rgb
+    features_dc, features_rest = color2feat(rgb)
+    f_dc = features_dc.flatten(start_dim=1).detach().cpu().numpy()
+    f_rest = features_rest.flatten(start_dim=1).detach().cpu().numpy()
+    normals = np.zeros_like(xyz)
+    # save
+    dtype_full = [(attribute, 'f4') for attribute in construct_list_of_attributes(features_dc,features_rest,scale,rotation)]
+    elements = np.empty(xyz.shape[0], dtype=dtype_full)
+    attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
+    elements[:] = list(map(tuple, attributes))
+    el = PlyElement.describe(elements, 'vertex')
+    PlyData([el]).write(path)

ops/visual_check.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import imageio
+import matplotlib
+from ops.utils import *
+from ops.gs.basic import *
+from ops.trajs import _generate_trajectory
+class Check():
+    def __init__(self) -> None:
+        pass
+    def _visual_pcd(self,scene:Gaussian_Scene):
+        xyzs,rgbs = [],[]
+        for i,gf in enumerate(scene.gaussian_frames):
+            xyz = gf.xyz.detach().cpu().numpy()
+            rgb = torch.sigmoid(gf.rgb).detach().cpu().numpy()
+            opacity = gf.opacity.detach().squeeze().cpu().numpy() > 1e-5
+            xyzs.append(xyz[opacity])
+            rgbs.append(rgb[opacity])
+        xyzs = np.concatenate(xyzs,axis=0)
+        rgbs = np.concatenate(rgbs,axis=0)
+        visual_pcd(xyzs,color=rgbs,normal=True)
+    @torch.no_grad()
+    def _render_video(self,scene:Gaussian_Scene,save_dir='./'):
+        # render 5times frames
+        nframes = len(scene.frames)*25
+        video_trajs = _generate_trajectory(None,scene,nframes=nframes)
+        H,W,intrinsic = scene.frames[0].H,scene.frames[0].W,deepcopy(scene.frames[0].intrinsic)
+        if H<W:
+            if H>512:
+                ratio = 512/H
+                W,H = int(W*ratio),int(H*ratio)
+                intrinsic[0:2] = intrinsic[0:2]*ratio
+        else:
+            if W>512:
+                ratio = 512/W
+                W,H = int(W*ratio),int(H*ratio)
+                intrinsic[0:2] = intrinsic[0:2]*ratio
+        # render
+        rgbs,dpts = [],[]
+        print(f'[INFO] rendering final video with {nframes} frames...')
+        for pose in video_trajs:
+            frame = Frame(H=H,W=W,
+                          intrinsic=intrinsic,
+                          extrinsic=pose)
+            rgb,dpt,alpha = scene._render_RGBD(frame)
+            rgb = rgb.detach().float().cpu().numpy()
+            dpt = dpt.detach().float().cpu().numpy()
+            dpts.append(dpt)
+            rgbs.append((rgb * 255).astype(np.uint8))
+        rgbs = np.stack(rgbs, axis=0)
+        dpts = np.stack(dpts, axis=0)
+        valid_dpts = dpts[dpts>0.]
+        _min = np.percentile(valid_dpts, 1)
+        _max = np.percentile(valid_dpts,99)
+        dpts = (dpts-_min) / (_max-_min)
+        dpts = dpts.clip(0,1)
+        cm = matplotlib.colormaps["plasma"]
+        dpts_color = cm(dpts,bytes=False)[...,0:3]
+        dpts_color = (dpts_color*255).astype(np.uint8)
+        imageio.mimwrite(f'{save_dir}video_rgb.mp4',rgbs,fps=20)
+        imageio.mimwrite(f'{save_dir}video_dpt.mp4',dpts_color,fps=20)

pipe/__init__.py ADDED Viewed

File without changes

pipe/c2f_recons.py ADDED Viewed

	@@ -0,0 +1,211 @@

+'''
+render using frames in GS
+inpaint with fooocus
+'''
+import os
+import torch
+import numpy as np
+from PIL import Image
+from copy import deepcopy
+from ops.utils import *
+from ops.sky import Sky_Seg_Tool
+from ops.visual_check import Check
+from ops.gs.train import GS_Train_Tool
+from pipe.lvm_inpaint import Inpaint_Tool
+from pipe.reconstruct import Reconstruct_Tool
+from ops.trajs import _generate_trajectory
+from ops.connect import Occlusion_Removal
+from ops.gs.basic import Frame,Gaussian_Scene
+from ops.mcs import HackSD_MCS
+from pipe.refine_mvdps import Refinement_Tool_MCS
+class Pipeline():
+    def __init__(self,cfg) -> None:
+        # Build the tools of the pipeline from the config object cfg.
+        # the device is fixed to cuda here
+        self.device = 'cuda'
+        self.cfg = cfg
+        # value written into the depth maps at sky pixels (see _initialization)
+        self.sky_value = cfg.model.sky.value
+        # sky segmentation, rgb in/outpainting and depth reconstruction tools
+        self.sky_segor = Sky_Seg_Tool(cfg)
+        self.rgb_inpaintor = Inpaint_Tool(cfg)
+        self.reconstructor = Reconstruct_Tool(cfg)
+        # temp
+        self.removalor = Occlusion_Removal()
+        self.checkor = Check()
+    def _mkdir(self,dir):
+        if not os.path.exists(dir):
+            os.makedirs(dir)
+    def _resize_input(self,fn):
+        resize_long_edge = int(self.cfg.scene.input.resize_long_edge)
+        print(f'[Preprocess...] Resize the long edge of input image to {resize_long_edge}.')
+        spl = str.rfind(fn,'.')
+        backup_fn = fn[:spl] + '.original' + fn[spl:]
+        rgb = Image.open(fn)
+        rgb.save(backup_fn) # back up original image
+        rgb = np.array(rgb)[:,:,:3]/255.
+        H,W = rgb.shape[0:2]
+        if H>W:
+            W = int(W*resize_long_edge/H)
+            H = resize_long_edge
+        else:
+            H = int(H*resize_long_edge/W)
+            W = resize_long_edge
+        rgb = cv2.resize(rgb,(W,H))
+        pic = (rgb * 255.0).clip(0, 255)
+        pic_save = Image.fromarray(pic.astype(np.uint8))
+        pic_save.save(fn)
+    def _initialization(self,rgb):
+        # Build the two initial frames (input view and its outpainted version) from the
+        # input image, add both to self.scene as trainable frames and refine the scene for 100 iterations.
+        # Reads self.outpaint_selections, self.outpaint_extend_times and self.scene, which are set outside this method.
+        rgb = np.array(rgb)[:,:,:3]
+        # conduct outpainting on rgb and change cu,cv
+        outpaint_frame :Frame = self.rgb_inpaintor(Frame(rgb=rgb),
+                                                   outpaint_selections=self.outpaint_selections,
+                                                   outpaint_extend_times=self.outpaint_extend_times)
+        # conduct reconstruction on outpaint results
+        # NOTE(review): the intrinsic estimated on the input view is overwritten by the next call
+        # before it is used -- confirm whether the first call is still needed
+        _,intrinsic,_ = self.reconstructor._ProDpt_(rgb) # estimate focal on input view
+        metric_dpt,intrinsic,edge_msk = self.reconstructor._ProDpt_(outpaint_frame.rgb)
+        outpaint_frame.intrinsic = deepcopy(intrinsic)
+        # split to input and outpaint areas
+        input_frame = Frame(H=rgb.shape[0],
+                            W=rgb.shape[1],
+                            rgb=rgb,
+                            intrinsic=deepcopy(intrinsic),
+                            extrinsic=np.eye(4))
+        # principal point of the input view is set to its image center
+        input_frame.intrinsic[0,-1] = input_frame.W/2.
+        input_frame.intrinsic[1,-1] = input_frame.H/2.
+        # others
+        # pixels that are not outpainted are cut out of the outpaint results and reshaped to the input size
+        input_area = ~outpaint_frame.inpaint
+        input_edg = edge_msk[input_area].reshape(input_frame.H,input_frame.W)
+        input_dpt = metric_dpt[input_area].reshape(input_frame.H,input_frame.W)
+        sky = self.sky_segor(input_frame.rgb)
+        input_frame.sky = sky
+        # sky pixels get the configured sky depth
+        input_dpt[sky] = self.sky_value
+        input_frame.dpt = input_dpt
+        # every non-sky pixel of the input view; the wo_edge variant additionally drops depth edges
+        input_frame.inpaint = np.ones_like(input_edg,bool) & (~sky)
+        input_frame.inpaint_wo_edge = (~input_edg) & (~sky)
+        input_frame.ideal_dpt = deepcopy(input_dpt)
+        input_frame.prompt = outpaint_frame.prompt
+        # outpaint frame
+        sky = self.sky_segor(outpaint_frame.rgb)
+        outpaint_frame.sky = sky
+        metric_dpt[sky] = self.sky_value
+        outpaint_frame.dpt = metric_dpt
+        outpaint_frame.ideal_dpt = deepcopy(metric_dpt)
+        # outpainted non-sky pixels; the wo_edge variant additionally drops depth edges
+        outpaint_frame.inpaint = (outpaint_frame.inpaint)&(~sky)
+        outpaint_frame.inpaint_wo_edge = (outpaint_frame.inpaint)&(~edge_msk)
+        # add init frame
+        self.scene._add_trainable_frame(input_frame,require_grad=True)
+        self.scene._add_trainable_frame(outpaint_frame,require_grad=True)
+        # refine the trainable gaussians against the scene frames
+        self.scene = GS_Train_Tool(self.scene,iters=100)(self.scene.frames)
+    def _generate_traj(self):
+        self.dense_trajs = _generate_trajectory(self.cfg,self.scene)
+    def _pose_to_frame(self,extrinsic,margin=32):
+        H = self.scene.frames[0].H + margin
+        W = self.scene.frames[0].W + margin
+        prompt = self.scene.frames[-1].prompt
+        intrinsic = deepcopy(self.scene.frames[0].intrinsic)
+        intrinsic[0,-1], intrinsic[1,-1] = W/2, H/2
+        frame = Frame(H=H,W=W,intrinsic=intrinsic,extrinsic=extrinsic,prompt=prompt)
+        frame = self.scene._render_for_inpaint(frame)
+        return frame
+    def _next_frame(self,margin=32):
+        # select the frame with largest holes but less than 60%
+        inpaint_area_ratio = []
+        for pose in self.dense_trajs:
+            temp_frame = self._pose_to_frame(pose,margin)
+            inpaint_mask = temp_frame.inpaint
+            inpaint_area_ratio.append(np.mean(inpaint_mask))
+        inpaint_area_ratio = np.array(inpaint_area_ratio)
+        inpaint_area_ratio[inpaint_area_ratio > 0.6] = 0.
+        # remove adjustancy frames
+        for s in self.select_frames:
+            inpaint_area_ratio[s] = 0.
+            if s-1>-1:
+                inpaint_area_ratio[s-1] = 0.
+            if s+1<len(self.dense_trajs):
+                inpaint_area_ratio[s+1] = 0.
+        # select the largest ones
+        select = np.argmax(inpaint_area_ratio)
+        if inpaint_area_ratio[select] < 0.0001: return None
+        self.select_frames.append(select)
+        pose = self.dense_trajs[select]
+        frame = self._pose_to_frame(pose,margin)
+        return frame
+    def _inpaint_next_frame(self,margin=32):
+        frame = self._next_frame(margin)
+        if frame is None: return None
+        # inpaint rgb
+        frame = self.rgb_inpaintor(frame)
+        # inpaint dpt
+        connect_dpt,metric_dpt,_,edge_msk = self.reconstructor._Guide_ProDpt_(frame.rgb,frame.intrinsic,frame.dpt,~frame.inpaint)
+        frame.dpt = connect_dpt
+        frame = self.removalor(self.scene,frame)
+        sky = self.sky_segor(frame.rgb)
+        frame.sky = sky
+        frame.dpt[sky] = self.sky_value
+        frame.inpaint = (frame.inpaint) & (~sky)
+        frame.inpaint_wo_edge = (frame.inpaint) & (~edge_msk)
+        # determine target depth and normal
+        frame.ideal_dpt = metric_dpt
+        self.scene._add_trainable_frame(frame)
+        return 0
+    def _coarse_scene(self,rgb):
+        self._initialization(rgb)
+        self._generate_traj()
+        self.select_frames = []
+        for i in range(self.n_sample-2):
+            print(f'Procecssing {i+2}/{self.n_sample} frame...')
+            sign = self._inpaint_next_frame()
+            if sign is None: break
+            self.scene = GS_Train_Tool(self.scene,iters=self.opt_iters_per_frame)(self.scene.frames)
+    def _MCS_Refinement(self):
+        refiner = HackSD_MCS(device='cuda',use_lcm=True,denoise_steps=self.mcs_iterations,
+                             sd_ckpt=self.cfg.model.optimize.sd,
+                             lcm_ckpt=self.cfg.model.optimize.lcm)
+        self.MVDPS = Refinement_Tool_MCS(self.scene,device='cuda',
+                                         refiner=refiner,
+                                         traj_type=self.traj_type,
+                                         n_view=self.mcs_n_view,
+                                         rect_w=self.mcs_rect_w,
+                                         n_gsopt_iters=self.mcs_gsopt_per_frame)
+        self.scene = self.MVDPS()
+        refiner.to('cpu')
+    def __call__(self):
+        rgb_fn = self.cfg.scene.input.rgb
+        # coarse
+        self.scene = Gaussian_Scene(self.cfg)
+        # for trajectory genearation
+        self.n_sample = self.cfg.scene.traj.n_sample
+        self.traj_type = self.cfg.scene.traj.traj_type
+        self.scene.traj_type = self.cfg.scene.traj.traj_type
+        # for scene generation
+        self.opt_iters_per_frame = self.cfg.scene.gaussian.opt_iters_per_frame
+        self.outpaint_selections = self.cfg.scene.outpaint.outpaint_selections
+        self.outpaint_extend_times = self.cfg.scene.outpaint.outpaint_extend_times
+        # for scene refinement
+        self.mcs_n_view = self.cfg.scene.mcs.n_view
+        self.mcs_rect_w = self.cfg.scene.mcs.rect_w
+        self.mcs_iterations = self.cfg.scene.mcs.steps
+        self.mcs_gsopt_per_frame = self.cfg.scene.mcs.gsopt_iters
+        # coarse scene
+        self._resize_input(rgb_fn)
+        dir = rgb_fn[:str.rfind(rgb_fn,'/')]
+        rgb = Image.open(rgb_fn)
+        self._coarse_scene(rgb)
+        torch.cuda.empty_cache()
+        # refinement
+        self._MCS_Refinement()
+        torch.save(self.scene,f'{dir}/scene.pth')
+        self.checkor._render_video(self.scene,save_dir=f'{dir}/')

pipe/cfgs/INSTRUCT.md ADDED Viewed

	@@ -0,0 +1,18 @@

+## INSTRUCTION
+Here, we provide an explanation of some key parameters in ```pipe/cfgs/basic.yaml``` to facilitate parameter adjustments. For more details, please refer to our paper.
+- ```scene.outpaint.outpaint_extend_times```
+  - This parameter controls the outpaint ratio of the image when constructing the global scaffold. A larger value will result in smoother scene boundaries, but it may also introduce distortion. A recommended range is between 0.3 and 0.6.
+- ```scene.traj```
+    - ```.n_sample```: This parameter controls the number of warp-and-inpaint iterations. The more iterations, the higher the scene integrity (fewer holes). In most cases, a value of 10 is sufficient.
+    - ```.far_percentage``` / ```.traj_forward_ratio``` / ```.traj_backward_ratio```
+      - These parameters control the range of the camera's spiral trajectory (also the final scene) in ```ops/trajs```. Directly reconstructing a very large scene might cause distortions.
+      - ```far_percentage``` controls the scale of the trajectory range. For large-scale scenes (especially those involving the sky or large windows), we recommend reducing this value. An example is in [this issue](https://github.com/WHU-USI3DV/VistaDream/issues/3).
+      - ```traj_forward_ratio``` and ```traj_backward_ratio``` control the forward and backward range of the camera, respectively.
+- ```scene.mcs```
+  - ```.steps``` means the MCS refine steps. We suggest a value between 8 and 15.
+  - ```.n_view``` means the number of viewpoints optimized simultaneously in MCS. On an RTX 4090 (24 GB), 8 is feasible.
+  - ```.rect_w``` determines the MCS control strength. We suggest 0.3-0.8.

pipe/cfgs/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from omegaconf import OmegaConf
+def load_cfg(cfg_path):
+    return OmegaConf.load(cfg_path)
+def merge_cfgs(cfg1,cfg2):
+    cfg = OmegaConf.merge(cfg1,cfg2)
+    return cfg

pipe/cfgs/basic.yaml ADDED Viewed

	@@ -0,0 +1,47 @@

+name: basic
+model:
+  sky:
+    value: 1e5 # to update
+    oneformer:
+      ckpt: 'tools/OneFormer/checkpoints/coco_pretrain_1280x1280_150_16_dinat_l_oneformer_ade20k_160k.pth'
+      yaml: 'tools/OneFormer/configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml'
+  vlm:
+    llava:
+      ckpt: 'llava-hf/bakLlava-v1-hf' # downloaded from hugging face
+  mde:
+    dpt_pro:
+      ckpt: 'tools/DepthPro/checkpoints/depth_pro.pt'
+  paint:
+    fooocus:
+      pass # it will load required checkpoints automatically
+  optimize:
+    sd: 'sd-legacy/stable-diffusion-v1-5' # downloaded from hugging face
+    lcm: 'latent-consistency/lcm-lora-sdv1-5'
+scene:
+  input:
+    rgb: 'data/sd_readingroom/color.png'
+    resize_long_edge: 512
+  outpaint:
+    outpaint_selections: ['Left','Right','Top','Bottom']
+    outpaint_extend_times: 0.45
+  traj:
+    n_sample: 10
+    traj_type: 'spiral'
+    near_percentage: 5
+    far_percentage: 95
+    traj_forward_ratio: 0.3
+    traj_backward_ratio: 0.7
+  gaussian:
+    opt_iters_per_frame: 512
+  mcs:
+    steps: 10 # among 50 total steps
+    n_view: 8
+    rect_w: 0.7
+    gsopt_iters: 256

pipe/lvm_inpaint.py ADDED Viewed

	@@ -0,0 +1,85 @@

+'''
+render using frames in GS
+inpaint with fooocus
+'''
+import torch
+import numpy as np
+from ops.llava import Llava
+from ops.gs.basic import Frame
+# from ops.fooocus import Fooocus
+# Local placeholder used instead of ops.fooocus.Fooocus (that import is commented out above).
+# NOTE(review): this stub defines no __call__, while Inpaint_Tool.__call__ invokes
+# self.fooocus(...) -- confirm how the real implementation is meant to be supplied.
+class Fooocus():
+    def __init__(self):pass
+class Inpaint_Tool():
+    def __init__(self,cfg) -> None:
+        self.cfg = cfg
+        self._load_model()
+    def _load_model(self):
+        self.fooocus = Fooocus()
+        self.llava = Llava(device='cpu',llava_ckpt=self.cfg.model.vlm.llava.ckpt)
+    def _llava_prompt(self,frame):
+        prompt = '<image>\n \
+                USER: Detaily imagine and describe the scene this image taken from? \
+                \n ASSISTANT: This image is taken from a scene of '
+        return prompt
+    def __call__(self, frame:Frame, outpaint_selections=[], outpaint_extend_times=0.0):
+        '''
+        Must be Frame type
+        '''
+        # conduct reconstuction
+        # ----------------------- LLaVA -----------------------
+        if frame.prompt is None:
+            print('Inpaint-Caption[1/3] Move llava.model to GPU...')
+            self.llava.model.to('cuda')
+            print('Inpaint-Caption[2/3] Llava inpainting instruction:')
+            query  = self._llava_prompt(frame)
+            prompt = self.llava(frame.rgb,query)
+            split  = str.rfind(prompt,'ASSISTANT: This image is taken from a scene of ') + len(f'ASSISTANT: This image is taken from a scene of ')
+            prompt = prompt[split:]
+            print(prompt)
+            print('Inpaint-Caption[3/3] Move llava.model to CPU...')
+            self.llava.model.to('cpu')
+            torch.cuda.empty_cache()
+            frame.prompt = prompt
+        else:
+            prompt = frame.prompt
+            print(f'Using pre-generated prompt: {prompt}')
+        # --------------------- Fooocus ----------------------
+        print('Inpaint-Fooocus[1/2] Fooocus inpainting...')
+        image = frame.rgb
+        mask = np.zeros_like(image,bool) if len(outpaint_selections)>0 else frame.inpaint
+        fooocus_result = self.fooocus(image_number=1,
+                            prompt=prompt + ' 8K, no large circles, no cameras, no fisheye.',
+                            negative_prompt='Any fisheye, any large circles, any blur, unrealism.',
+                            outpaint_selections=outpaint_selections,
+                            outpaint_extend_times=outpaint_extend_times,
+                            origin_image=image,
+                            mask_image=mask,)[0]
+        torch.cuda.empty_cache()
+        # reset the frame for outpainting
+        if len(outpaint_selections) > 0.:
+            assert len(outpaint_selections) == 4
+            small_H, small_W = frame.rgb.shape[0:2]
+            large_H, large_W = fooocus_result.shape[0:2]
+            if frame.intrinsic is not None:
+                # NO CHANGE TO FOCAL
+                frame.intrinsic[0,-1] = large_W//2
+                frame.intrinsic[1,-1] = large_H//2
+            # begin sample pixel
+            frame.H = large_H
+            frame.W = large_W
+            begin_H = (large_H-small_H)//2
+            begin_W = (large_W-small_W)//2
+            inpaint = np.ones_like(fooocus_result[...,0])
+            inpaint[begin_H:(begin_H+small_H),begin_W:(begin_W+small_W)] *= 0.
+            frame.inpaint = inpaint > 0.5
+        frame.rgb = fooocus_result
+        print('Inpaint-Fooocus[2/2] Assign Frame...')
+        return frame

pipe/reconstruct.py ADDED Viewed

	@@ -0,0 +1,52 @@

+'''
+Dust3R reconstruction
+GeoWizard Estimation
+Smooth Projection
+'''
+import torch
+import PIL,cv2
+import numpy as np
+from PIL import Image
+from ops.gs.basic import Frame
+from ops.utils import *
+from ops.depth_pro import Depth_Pro_Tool
+from ops.connect import Smooth_Connect_Tool
+class Reconstruct_Tool():
+    def __init__(self,cfg) -> None:
+        self.cfg = cfg
+        self._load_model()
+        self.connector = Smooth_Connect_Tool()
+    def _load_model(self):
+        self.pro_dpt = Depth_Pro_Tool(ckpt=self.cfg.model.mde.dpt_pro.ckpt,device='cpu')
+    def _ProDpt_(self, rgb, intrinsic=None):
+        # conduct reconstruction
+        print('Pro_dpt[1/3] Move Pro_dpt.model to GPU...')
+        self.pro_dpt.to('cuda')
+        print('Pro_dpt[2/3] Pro_dpt Estimation...')
+        f_px = intrinsic[0,0] if intrinsic is not None else None
+        metric_dpt,intrinsic = self.pro_dpt(rgb,f_px)
+        print('Pro_dpt[3/3] Move Pro_dpt.model to GPU...')
+        self.pro_dpt.to('cpu')
+        torch.cuda.empty_cache()
+        edge_mask = edge_filter(metric_dpt,times=0.05)
+        return metric_dpt, intrinsic, edge_mask
+    def _Guide_ProDpt_(self, rgb, intrinsic=None, refer_dpt=None, refer_msk=None):
+        # conduct reconstruction
+        print('Pro_dpt[1/3] Move Pro_dpt.model to GPU...')
+        self.pro_dpt.to('cuda')
+        print('Pro_dpt[2/3] Pro_dpt Estimation...')
+        f_px = intrinsic[0,0] if intrinsic is not None else None
+        metric_dpt,intrinsic = self.pro_dpt(rgb,f_px=f_px)
+        metric_dpt_connect = self.connector._affine_dpt_to_GS(refer_dpt,metric_dpt,~refer_msk)
+        print('Pro_dpt[3/3] Move Pro_dpt.model to GPU...')
+        self.pro_dpt.to('cpu')
+        torch.cuda.empty_cache()
+        edge_mask = edge_filter(metric_dpt_connect,times=0.05)
+        return metric_dpt_connect, metric_dpt, intrinsic, edge_mask
+    # ------------- TODO: Metricv2 + Guide-GeoWizard ------------------ #

pipe/refine_mvdps.py ADDED Viewed

	@@ -0,0 +1,155 @@

+'''
+Coarse Gaussian Rendering -- RGB-D as init
+RGB-D add noise (MV init)
+Cycling:
+    denoise to x0 and d0 -- optimize Gaussian
+    re-rendering RGB-D
+    render RGB-D to rectified noise
+    noise rectification
+    step denoise with rectified noise
+-- Finally the Gaussian
+'''
+import torch
+import numpy as np
+from copy import deepcopy
+from ops.utils import *
+from ops.gs.train import *
+from ops.trajs import _generate_trajectory
+from ops.gs.basic import Frame,Gaussian_Scene
+class Refinement_Tool_MCS():
+    '''
+    Refine a coarse Gaussian scene by alternating diffusion denoising of rendered
+    views with Gaussian optimization towards the denoised images (see module docstring).
+    Calling the instance runs the whole refinement and returns the refined scene.
+    '''
+    def __init__(self,
+                 coarse_GS:Gaussian_Scene,
+                 device = 'cuda',
+                 refiner = None,
+                 traj_type = 'spiral',
+                 n_view = 8,
+                 rect_w = 0.7,
+                 n_gsopt_iters = 256) -> None:
+        '''
+        coarse_GS: scene to refine; its frames[0] and frames[1] are used as kept views
+            and the prompt of frames[-1] is used as text prompt.
+        device: device the timestep tensor is moved to.
+        refiner: diffusion model wrapper; required in practice, since it is moved to
+            'cuda' and queried for denoise_steps here.
+        traj_type: stored on the instance; not read again inside this class.
+        n_view: number of viewpoints refined together.
+        rect_w: weight passed to the refiner's _step_denoise.
+        n_gsopt_iters: Gaussian optimization iterations per diffusion step.
+        '''
+        # input coarse GS
+        # refine frames to be refined; here we refine frames rather than gaussian paras
+        self.n_view = n_view
+        self.rect_w = rect_w
+        self.n_gsopt_iters = n_gsopt_iters
+        self.coarse_GS = coarse_GS
+        self.refine_frames: list[Frame] = []
+        # hyperparameters total is 50 steps and here is the last N steps
+        self.process_res = 512
+        self.device = device
+        self.traj_type = traj_type
+        # models
+        self.RGB_LCM = refiner
+        self.RGB_LCM.to('cuda')
+        self.steps = self.RGB_LCM.denoise_steps
+        # prompt for diffusion
+        prompt = self.coarse_GS.frames[-1].prompt
+        self.rgb_prompt_latent = self.RGB_LCM.model._encode_text_prompt(prompt)
+        # loss function
+        self.rgb_lossfunc = RGB_Loss(w_ssim=0.2)
+    def _pre_process(self):
+        '''
+        Create the n_view frames to refine: all share one rescaled intrinsic and a
+        size of process_res plus a border of strict_times pixels on each side.
+        '''
+        # determine the diffusion target shape
+        strict_times = 32
+        origin_H = self.coarse_GS.frames[0].H
+        origin_W = self.coarse_GS.frames[0].W
+        self.target_H,self.target_W = self.process_res,self.process_res
+        # reshape to the same (target) shape for rendering and denoising
+        intrinsic = deepcopy(self.coarse_GS.frames[0].intrinsic)
+        H_ratio, W_ratio = self.target_H/origin_H, self.target_W/origin_W
+        # rows 0 and 1 of the intrinsic are scaled to the target resolution
+        intrinsic[0] *= W_ratio
+        intrinsic[1] *= H_ratio
+        # the rendered frames are enlarged by the border and re-centred
+        target_H, target_W = self.target_H+2*strict_times, self.target_W+2*strict_times
+        intrinsic[0,-1] = target_W/2
+        intrinsic[1,-1] = target_H/2
+        # generate a set of cameras
+        # n_view+2 poses are requested and the first and last one are dropped
+        trajs = _generate_trajectory(None,self.coarse_GS,nframes=self.n_view+2)[1:-1]
+        for i, pose in enumerate(trajs):
+            fine_frame = Frame()
+            fine_frame.H = target_H
+            fine_frame.W = target_W
+            fine_frame.extrinsic = pose
+            fine_frame.intrinsic = deepcopy(intrinsic)
+            fine_frame.prompt  = self.coarse_GS.frames[-1].prompt
+            self.refine_frames.append(fine_frame)
+        # determine inpaint mask
+        # temporary scene holding only the first two coarse frames
+        temp_scene = Gaussian_Scene()
+        temp_scene._add_trainable_frame(self.coarse_GS.frames[0],require_grad=False)
+        temp_scene._add_trainable_frame(self.coarse_GS.frames[1],require_grad=False)
+        # NOTE(review): the returned frame is only bound to the loop variable, so this
+        # presumably relies on _render_for_inpaint updating the frame in place -- confirm
+        for frame in self.refine_frames:
+            frame = temp_scene._render_for_inpaint(frame)
+    def _mv_init(self):
+        '''
+        Render every refine frame from the coarse scene and hand the stacked
+        renderings to the refiner as its multi-view initial images.
+        '''
+        rgbs = []
+        # only for inpainted images
+        for frame in self.refine_frames:
+            # rendering at now; all in the same shape
+            render_rgb,render_dpt,render_alpha=self.coarse_GS._render_RGBD(frame)
+            # diffusion images
+            # move the last axis to the front and add a leading axis for concatenation
+            rgbs.append(render_rgb.permute(2,0,1)[None])
+        self.rgbs = torch.cat(rgbs,dim=0)
+        self.RGB_LCM._encode_mv_init_images(self.rgbs)
+    def _to_cuda(self,tensor):
+        '''
+        Convert a numpy array to a float32 torch tensor on 'cuda'.
+        '''
+        tensor = torch.from_numpy(tensor.astype(np.float32)).to('cuda')
+        return tensor
+    def _x0_rectification(self, denoise_rgb, iters):
+        '''
+        Optimize a fresh deep copy of the coarse scene for `iters` iterations so that
+        its renderings match denoise_rgb[i] on refine frame i while staying close to
+        the first two coarse frames. The trainer is kept on self.refine_GS.
+        '''
+        # gaussian initialization
+        # every call starts again from the coarse scene, not from the previous result
+        CGS = deepcopy(self.coarse_GS)
+        for gf in CGS.gaussian_frames:
+            gf._require_grad(True)
+        self.refine_GS = GS_Train_Tool(CGS)
+        # rectification
+        for iter in range(iters):
+            loss = 0.
+            # supervise on input view
+            for i in range(2):
+                keep_frame :Frame = self.coarse_GS.frames[i]
+                render_rgb,render_dpt,render_alpha = self.refine_GS._render(keep_frame)
+                loss_rgb = self.rgb_lossfunc(render_rgb,self._to_cuda(keep_frame.rgb),valid_mask=keep_frame.inpaint)
+                # each kept view is weighted by the number of refine frames
+                loss += loss_rgb*len(self.refine_frames)
+            # then multiview supervision
+            for i,frame in enumerate(self.refine_frames):
+                render_rgb,render_dpt,render_alpha = self.refine_GS._render(frame)
+                loss_rgb_item = self.rgb_lossfunc(denoise_rgb[i],render_rgb)
+                loss += loss_rgb_item
+            # optimization
+            loss.backward()
+            self.refine_GS.optimizer.step()
+            self.refine_GS.optimizer.zero_grad()
+    def _step_gaussian_optimization(self,step):
+        '''
+        Run one denoise-to-x0 prediction at diffusion step `step` and optimize the
+        Gaussians towards it. Returns (rgb_t, rgb_noise_pr) for the rectification step.
+        '''
+        # denoise to x0 and d0
+        with torch.no_grad():
+            # we left the last 2 steps for stronger guidances
+            # index into the last self.steps entries of the refiner's timesteps
+            rgb_t = self.RGB_LCM.timesteps[-self.steps+step]
+            rgb_t = torch.tensor([rgb_t]).to(self.device)
+            rgb_noise_pr,rgb_denoise = self.RGB_LCM._denoise_to_x0(rgb_t,self.rgb_prompt_latent)
+            # reorder axes to match the renderings compared against in _x0_rectification
+            rgb_denoise = rgb_denoise.permute(0,2,3,1)
+        # rendering each frames and weight-able refinement
+        self._x0_rectification(rgb_denoise,self.n_gsopt_iters)
+        return rgb_t, rgb_noise_pr
+    def _step_diffusion_rectification(self, rgb_t, rgb_noise_pr):
+        '''
+        Re-render all refine frames from the optimized Gaussians and pass them to the
+        refiner's _step_denoise together with rgb_t, rgb_noise_pr and rect_w.
+        '''
+        # re-rendering RGB
+        with torch.no_grad():
+            x0_rect = []
+            for i,frame in enumerate(self.refine_frames):
+                re_render_rgb,_,re_render_alpha= self.refine_GS._render(frame)
+                # avoid rasterization holes yield more block holes and more
+                x0_rect.append(re_render_rgb.permute(2,0,1)[None])
+            x0_rect = torch.cat(x0_rect,dim=0)
+        # rectification
+        self.RGB_LCM._step_denoise(rgb_t,rgb_noise_pr,x0_rect,rect_w=self.rect_w)
+    def __call__(self):
+        '''
+        Run the full refinement and return the refined scene with gradients
+        disabled on all of its gaussian frames.
+        '''
+        # warmup
+        self._pre_process()
+        self._mv_init()
+        # NOTE(review): tqdm is not imported explicitly in this module; presumably it
+        # comes from one of the wildcard imports -- confirm
+        for step in tqdm.tqdm(range(self.steps)):
+            rgb_t, rgb_noise_pr = self._step_gaussian_optimization(step)
+            self._step_diffusion_rectification(rgb_t, rgb_noise_pr)
+        # the scene held by the trainer of the last optimization round is the result
+        scene = self.refine_GS.GS
+        for gf in scene.gaussian_frames:
+            gf._require_grad(False)
+        return scene

requirements.txt ADDED Viewed

	@@ -0,0 +1,28 @@

+torch == 2.0.1
+torchvision == 0.15.2
+xformers == 0.0.21
+numpy
+regex
+torchmetrics
+accelerate
+gsplat
+open3d
+tqdm
+omegaconf
+opencv-python
+opencv-contrib-python
+plyfile
+timm
+wandb
+ftfy
+pillow_heif
+diffdist
+diffusers
+einops
+imageio
+imageio-ffmpeg
+transformers
+torchsde
+huggingface-hub

tools/DepthPro/ACKNOWLEDGEMENTS.md ADDED Viewed

	@@ -0,0 +1,418 @@

+Acknowledgements
+Portions of this Software may utilize the following copyrighted
+material, the use of which is hereby acknowledged.
+------------------------------------------------
+PyTorch Image Models (timm)
+Ross Wightman
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2019 Ross Wightman
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+------------------------------------------------
+DINOv2: Learning Robust Visual Features without Supervision
+Github source: https://github.com/facebookresearch/dinov2
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

tools/DepthPro/CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# Code of Conduct
+## Our Pledge
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+## Our Standards
+Examples of behavior that contributes to creating a positive environment
+include:
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Our Responsibilities
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+## Scope
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
+available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)

tools/DepthPro/CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# Contribution Guide
+Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.
+While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.
+## Before you get started
+By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
+We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).

tools/DepthPro/LICENSE ADDED Viewed

	@@ -0,0 +1,47 @@

+Copyright (C) 2024 Apple Inc. All Rights Reserved.
+Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple
+Inc. ("Apple") in consideration of your agreement to the following
+terms, and your use, installation, modification or redistribution of
+this Apple software constitutes acceptance of these terms.  If you do
+not agree with these terms, please do not use, install, modify or
+redistribute this Apple software.
+In consideration of your agreement to abide by the following terms, and
+subject to these terms, Apple grants you a personal, non-exclusive
+license, under Apple's copyrights in this original Apple software (the
+"Apple Software"), to use, reproduce, modify and redistribute the Apple
+Software, with or without modifications, in source and/or binary forms;
+provided that if you redistribute the Apple Software in its entirety and
+without modifications, you must retain this notice and the following
+text and disclaimers in all such redistributions of the Apple Software.
+Neither the name, trademarks, service marks or logos of Apple Inc. may
+be used to endorse or promote products derived from the Apple Software
+without specific prior written permission from Apple.  Except as
+expressly stated in this notice, no other rights or licenses, express or
+implied, are granted by Apple herein, including but not limited to any
+patent rights that may be infringed by your derivative works or by other
+works in which the Apple Software may be incorporated.
+The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
+MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-------------------------------------------------------------------------------
+SOFTWARE DISTRIBUTED IN THIS REPOSITORY:
+This software includes a number of subcomponents with separate
+copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
+-------------------------------------------------------------------------------

tools/DepthPro/README.md ADDED Viewed

	@@ -0,0 +1,97 @@

+## Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
+This software project accompanies the research paper:
+**[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**,
+*Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*.
+![](data/depth-pro-teaser.jpg)
+We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image.
+The model in this repository is a reference implementation, which has been re-trained. Its performance is close to the model reported in the paper but does not match it exactly.
+## Getting Started
+We recommend setting up a virtual environment. Using e.g. miniconda, the `depth_pro` package can be installed via:
+```bash
+conda create -n depth-pro -y python=3.9
+conda activate depth-pro
+pip install -e .
+```
+To download pretrained checkpoints follow the code snippet below:
+```bash
+source get_pretrained_models.sh   # Files will be downloaded to `checkpoints` directory.
+```
+### Running from commandline
+We provide a helper script to directly run the model on a single image:
+```bash
+# Run prediction on a single image:
+depth-pro-run -i ./data/example.jpg
+# Run `depth-pro-run -h` for available options.
+```
+### Running from python
+```python
+from PIL import Image
+import depth_pro
+# Load model and preprocessing transform
+model, transform = depth_pro.create_model_and_transforms()
+model.eval()
+# Load and preprocess an image.
+image, _, f_px = depth_pro.load_rgb(image_path)
+image = transform(image)
+# Run inference.
+prediction = model.infer(image, f_px=f_px)
+depth = prediction["depth"]  # Depth in [m].
+focallength_px = prediction["focallength_px"]  # Focal length in pixels.
+```
+### Evaluation (boundary metrics)
+Our boundary metrics can be found under `eval/boundary_metrics.py` and used as follows:
+```python
+# for a depth-based dataset
+boundary_f1 = SI_boundary_F1(predicted_depth, target_depth)
+# for a mask-based dataset (image matting / segmentation)
+boundary_recall = SI_boundary_Recall(predicted_depth, target_mask)
+```
+## Citation
+If you find our work useful, please cite the following paper:
+```bibtex
+@article{Bochkovskii2024:arxiv,
+  author     = {Aleksei Bochkovskii and Ama\"{e}l Delaunoy and Hugo Germain and Marcel Santos and
+               Yichao Zhou and Stephan R. Richter and Vladlen Koltun},
+  title      = {Depth Pro: Sharp Monocular Metric Depth in Less Than a Second},
+  journal    = {arXiv},
+  year       = {2024},
+  url        = {https://arxiv.org/abs/2410.02073},
+}
+```
+## License
+This sample code is released under the [LICENSE](LICENSE) terms.
+The model weights are released under the [LICENSE](LICENSE) terms.
+## Acknowledgements
+Our codebase is built using multiple open-source contributions; please see [Acknowledgements](ACKNOWLEDGEMENTS.md) for more details.
+Please check the paper for a complete list of references and datasets used in this work.

tools/DepthPro/command_pro_dpt.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import src.depth_pro as depth_pro
+import numpy as np
+from PIL import Image
+from src.depth_pro.depth_pro import DepthProConfig
+class apple_pro_depth():
+    def __init__(self,device='cuda',ckpt = '/mnt/proj/SOTAs/ml-depth-pro-main/checkpoints/depth_pro.pt'):
+        self.ckpt = ckpt
+        self.device = device
+        self._load_model()
+    def _load_model(self):
+        cfg = DepthProConfig(
+            patch_encoder_preset="dinov2l16_384",
+            image_encoder_preset="dinov2l16_384",
+            checkpoint_uri=self.ckpt,
+            decoder_features=256,
+            use_fov_head=True,
+            fov_encoder_preset="dinov2l16_384",
+        )
+        self.model, self.transform = depth_pro.create_model_and_transforms(config=cfg,device=self.device)
+        self.model.eval()
+    def get_intrins(self, f, H, W):
+        new_cu = (W / 2.0) - 0.5
+        new_cv = (H / 2.0) - 0.5
+        intrins = np.array([
+            [f,         0,     new_cu  ],
+            [0,         f,     new_cv  ],
+            [0,         0,     1       ]
+        ])
+        return intrins
+    def to(self,device):
+        self.device = device
+        self.model.to(device)
+    def __call__(self, image,f_px=None):
+        if type(image) is np.ndarray:
+            if np.amax(image) < 1.1:
+                image = image*255
+            image = Image.fromarray(image.astype(np.uint8))
+        # trans
+        image = self.transform(image).to(self.device)
+        # predict
+        prediction = self.model.infer(image, f_px=f_px)
+        depth = prediction["depth"]  # Depth in [m].
+        focallength_px = prediction["focallength_px"]  # Focal length in pixels.
+        # output
+        H,W = depth.shape[0:2]
+        depth = depth.detach().cpu().numpy()
+        focallength_px = focallength_px.detach().cpu().numpy() if f_px is None else f_px
+        intrisnc = self.get_intrins(focallength_px,H,W)
+        return depth, intrisnc

tools/DepthPro/get_pretrained_models.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+#!/usr/bin/env bash
+#
+# For licensing see accompanying LICENSE file.
+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+#
+# Downloads the pretrained Depth Pro weights into ./checkpoints,
+# relative to the directory the script is run (or sourced) from.
+#
+# Create the target directory if it does not exist yet.
+mkdir -p checkpoints
+# Place final weights here:
+wget https://ml-site.cdn-apple.com/models/depth-pro/depth_pro.pt -P checkpoints

tools/DepthPro/pyproject.toml ADDED Viewed

	@@ -0,0 +1,59 @@

+[project]
+name = "depth_pro"
+version = "0.1"
+description = "Inference/Network/Model code for Apple Depth Pro monocular depth estimation."
+readme = "README.md"
+dependencies = [
+    "torch",
+    "torchvision",
+    "timm",
+    "numpy<2",
+    "pillow_heif",
+    "matplotlib",
+]
+[project.scripts]
+depth-pro-run = "depth_pro.cli:run_main"
+[project.urls]
+Homepage = "https://github.com/apple/ml-depth-pro"
+Repository = "https://github.com/apple/ml-depth-pro"
+[build-system]
+requires = ["setuptools", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.pyright]
+include = ["src"]
+exclude = [
+    "**/node_modules",
+    "**/__pycache__",
+]
+pythonVersion = "3.9"
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests"
+]
+filterwarnings = [
+    "ignore::DeprecationWarning"
+]
+[tool.lint.per-file-ignores]
+"__init__.py" = ["F401", "D100", "D104"]
+[tool.ruff]
+line-length = 100
+lint.select = ["E", "F", "D", "I"]
+lint.ignore = ["D100", "D105"]
+extend-exclude = [
+    "*external*",
+    "third_party",
+]
+src = ["depth_pro", "tests"]
+target-version = "py39"

tools/DepthPro/src/depth_pro/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+"""Depth Pro package."""
+from .depth_pro import create_model_and_transforms  # noqa
+from .utils import load_rgb  # noqa

tools/DepthPro/src/depth_pro/cli/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+"""Depth Pro CLI and tools."""
+from .run import main as run_main  # noqa

tools/DepthPro/src/depth_pro/cli/run.py ADDED Viewed

	@@ -0,0 +1,154 @@

+#!/usr/bin/env python3
+"""Sample script to run DepthPro.
+Copyright (C) 2024 Apple Inc. All Rights Reserved.
+"""
+import argparse
+import logging
+from pathlib import Path
+import numpy as np
+import PIL.Image
+import torch
+from matplotlib import pyplot as plt
+from tqdm import tqdm
+from depth_pro import create_model_and_transforms, load_rgb
+LOGGER = logging.getLogger(__name__)
+def get_torch_device() -> torch.device:
+    """Get the Torch device."""
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda:0")
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    return device
+def run(args):
+    """Run Depth Pro on a sample image."""
+    if args.verbose:
+        logging.basicConfig(level=logging.INFO)
+    # Load model.
+    model, transform = create_model_and_transforms(
+        device=get_torch_device(),
+        precision=torch.half,
+    )
+    model.eval()
+    image_paths = [args.image_path]
+    if args.image_path.is_dir():
+        image_paths = args.image_path.glob("**/*")
+        relative_path = args.image_path
+    else:
+        relative_path = args.image_path.parent
+    if not args.skip_display:
+        plt.ion()
+        fig = plt.figure()
+        ax_rgb = fig.add_subplot(121)
+        ax_disp = fig.add_subplot(122)
+    for image_path in tqdm(image_paths):
+        # Load image and focal length from exif info (if found.).
+        try:
+            LOGGER.info(f"Loading image {image_path} ...")
+            image, _, f_px = load_rgb(image_path)
+        except Exception as e:
+            LOGGER.error(str(e))
+            continue
+        # Run prediction. If `f_px` is provided, it is used to estimate the final metric depth,
+        # otherwise the model estimates `f_px` to compute the depth metricness.
+        prediction = model.infer(transform(image), f_px=f_px)
+        # Extract the depth and focal length.
+        depth = prediction["depth"].detach().cpu().numpy().squeeze()
+        if f_px is not None:
+            LOGGER.debug(f"Focal length (from exif): {f_px:0.2f}")
+        elif prediction["focallength_px"] is not None:
+            focallength_px = prediction["focallength_px"].detach().cpu().item()
+            LOGGER.info(f"Estimated focal length: {focallength_px}")
+        inverse_depth = 1 / depth
+        # Visualize inverse depth instead of depth, clipped to [0.1m;250m] range for better visualization.
+        max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)
+        min_invdepth_vizu = max(1 / 250, inverse_depth.min())
+        inverse_depth_normalized = (inverse_depth - min_invdepth_vizu) / (
+            max_invdepth_vizu - min_invdepth_vizu
+        )
+        # Save Depth as npz file.
+        if args.output_path is not None:
+            output_file = (
+                args.output_path
+                / image_path.relative_to(relative_path).parent
+                / image_path.stem
+            )
+            LOGGER.info(f"Saving depth map to: {str(output_file)}")
+            output_file.parent.mkdir(parents=True, exist_ok=True)
+            np.savez_compressed(output_file, depth=depth)
+            # Save as color-mapped "turbo" jpg image.
+            cmap = plt.get_cmap("turbo")
+            color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype(
+                np.uint8
+            )
+            color_map_output_file = str(output_file) + ".jpg"
+            LOGGER.info(f"Saving color-mapped depth to: : {color_map_output_file}")
+            PIL.Image.fromarray(color_depth).save(
+                color_map_output_file, format="JPEG", quality=90
+            )
+        # Display the image and estimated depth map.
+        if not args.skip_display:
+            ax_rgb.imshow(image)
+            ax_disp.imshow(inverse_depth_normalized, cmap="turbo")
+            fig.canvas.draw()
+            fig.canvas.flush_events()
+    LOGGER.info("Done predicting depth!")
+    if not args.skip_display:
+        plt.show(block=True)
+def main():
+    """Run DepthPro inference example."""
+    parser = argparse.ArgumentParser(
+        description="Inference scripts of DepthPro with PyTorch models."
+    )
+    parser.add_argument(
+        "-i",
+        "--image-path",
+        type=Path,
+        default="./data/example.jpg",
+        help="Path to input image.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=Path,
+        help="Path to store output files.",
+    )
+    parser.add_argument(
+        "--skip-display",
+        action="store_true",
+        help="Skip matplotlib display.",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Show verbose output."
+    )
+    run(parser.parse_args())
+# Start the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()

tools/DepthPro/src/depth_pro/depth_pro.py ADDED Viewed

	@@ -0,0 +1,298 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Mapping, Optional, Tuple, Union
+import torch
+from torch import nn
+from torchvision.transforms import (
+    Compose,
+    ConvertImageDtype,
+    Lambda,
+    Normalize,
+    ToTensor,
+)
+from .network.decoder import MultiresConvDecoder
+from .network.encoder import DepthProEncoder
+from .network.fov import FOVNetwork
+from .network.vit_factory import VIT_CONFIG_DICT, ViTPreset, create_vit
+@dataclass
+class DepthProConfig:
+    """Configuration for DepthPro.
+    Consumed by `create_model_and_transforms` to build the encoders, the decoder
+    and (optionally) the field-of-view head, and to locate the checkpoint.
+    """
+    # Preset name of the ViT backbone used as patch encoder.
+    patch_encoder_preset: ViTPreset
+    # Preset name of the ViT backbone used as image encoder.
+    image_encoder_preset: ViTPreset
+    # Feature width handed to both the DepthPro encoder and the decoder.
+    decoder_features: int
+    # Path passed to `torch.load`; when None no weights are loaded.
+    checkpoint_uri: Optional[str] = None
+    # Preset name of the backbone for the field-of-view head; only used when
+    # `use_fov_head` is True and this is not None.
+    fov_encoder_preset: Optional[ViTPreset] = None
+    # Whether the model gets a field-of-view estimation head.
+    use_fov_head: bool = True
+# Default configuration of `create_model_and_transforms`. Despite the name this is
+# a `DepthProConfig` instance, not a dict. All three encoders use the same preset,
+# and the checkpoint path is relative to the current working directory.
+DEFAULT_MONODEPTH_CONFIG_DICT = DepthProConfig(
+    patch_encoder_preset="dinov2l16_384",
+    image_encoder_preset="dinov2l16_384",
+    checkpoint_uri="./checkpoints/depth_pro.pt",
+    decoder_features=256,
+    use_fov_head=True,
+    fov_encoder_preset="dinov2l16_384",
+)
+def create_backbone_model(
+    preset: ViTPreset
+) -> Tuple[nn.Module, ViTPreset]:
+    """Create and load a backbone model given a config.
+    Args:
+    ----
+        preset: A backbone preset to load pre-defind configs.
+    Returns:
+    -------
+        A Torch module and the associated config.
+    """
+    if preset in VIT_CONFIG_DICT:
+        config = VIT_CONFIG_DICT[preset]
+        model = create_vit(preset=preset, use_pretrained=False)
+    else:
+        raise KeyError(f"Preset {preset} not found.")
+    return model, config
+def create_model_and_transforms(
+    config: DepthProConfig = DEFAULT_MONODEPTH_CONFIG_DICT,
+    device: torch.device = torch.device("cpu"),
+    precision: torch.dtype = torch.float32,
+) -> Tuple[DepthPro, Compose]:
+    """Create a DepthPro model and load weights from `config.checkpoint_uri`.
+    The backbones are created without pretrained weights; the only weights loaded
+    are those of the checkpoint, and only when `config.checkpoint_uri` is not None.
+    Args:
+    ----
+        config: The configuration for the DPT model architecture.
+        device: The optional Torch device to load the model onto, default runs on "cpu".
+        precision: The optional precision used for the model, default is FP32.
+            The model itself is only converted when this is `torch.half`; the
+            returned transform always converts its output to this dtype.
+    Returns:
+    -------
+        The Torch DepthPro model and associated Transform.
+    Raises:
+    ------
+        KeyError: If a configured preset is unknown (raised by `create_backbone_model`),
+            or by the key checks that follow the checkpoint load.
+    """
+    # Only the patch encoder's preset config is kept: it provides the feature
+    # dimensions and the ids of the blocks to hook.
+    patch_encoder, patch_encoder_config = create_backbone_model(
+        preset=config.patch_encoder_preset
+    )
+    image_encoder, _ = create_backbone_model(
+        preset=config.image_encoder_preset
+    )
+    # The field-of-view head gets its own encoder only if both the head is enabled
+    # and a preset is configured; otherwise None is passed on to the model.
+    fov_encoder = None
+    if config.use_fov_head and config.fov_encoder_preset is not None:
+        fov_encoder, _ = create_backbone_model(preset=config.fov_encoder_preset)
+    dims_encoder = patch_encoder_config.encoder_feature_dims
+    hook_block_ids = patch_encoder_config.encoder_feature_layer_ids
+    encoder = DepthProEncoder(
+        dims_encoder=dims_encoder,
+        patch_encoder=patch_encoder,
+        image_encoder=image_encoder,
+        hook_block_ids=hook_block_ids,
+        decoder_features=config.decoder_features,
+    )
+    # The decoder receives one extra leading input of width `decoder_features`
+    # in front of the encoder's own feature dimensions.
+    decoder = MultiresConvDecoder(
+        dims_encoder=[config.decoder_features] + list(encoder.dims_encoder),
+        dim_decoder=config.decoder_features,
+    )
+    model = DepthPro(
+        encoder=encoder,
+        decoder=decoder,
+        last_dims=(32, 1),
+        use_fov_head=config.use_fov_head,
+        fov_encoder=fov_encoder,
+    ).to(device)
+    # Any precision other than torch.half leaves the model parameters untouched.
+    if precision == torch.half:
+        model.half()
+    # Input pipeline: to tensor, move to `device`, normalize each channel with
+    # mean 0.5 and std 0.5, then convert to the requested dtype.
+    transform = Compose(
+        [
+            ToTensor(),
+            Lambda(lambda x: x.to(device)),
+            Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
+            ConvertImageDtype(precision),
+        ]
+    )
+    if config.checkpoint_uri is not None:
+        # NOTE(review): torch.load unpickles the file; only load checkpoints that
+        # come from a trusted source.
+        state_dict = torch.load(config.checkpoint_uri, map_location="cpu")
+        # NOTE(review): per the PyTorch documentation, strict=True already raises
+        # on any missing or unexpected key, so the key checks below (including the
+        # fc_norm exemption) presumably never trigger. Confirm the intended mode.
+        missing_keys, unexpected_keys = model.load_state_dict(
+            state_dict=state_dict, strict=True
+        )
+        if len(unexpected_keys) != 0:
+            raise KeyError(
+                f"Found unexpected keys when loading monodepth: {unexpected_keys}"
+            )
+        # fc_norm is only for the classification head,
+        # which we would not use. We only use the encoding.
+        missing_keys = [key for key in missing_keys if "fc_norm" not in key]
+        if len(missing_keys) != 0:
+            raise KeyError(f"Keys are missing when loading monodepth: {missing_keys}")
+    return model, transform
+class DepthPro(nn.Module):
+    """DepthPro network."""
+    def __init__(
+        self,
+        encoder: DepthProEncoder,
+        decoder: MultiresConvDecoder,
+        last_dims: tuple[int, int],
+        use_fov_head: bool = True,
+        fov_encoder: Optional[nn.Module] = None,
+    ):
+        """Initialize DepthPro.
+        Args:
+        ----
+            encoder: The DepthProEncoder backbone.
+            decoder: The MultiresConvDecoder decoder.
+            last_dims: The dimension for the last convolution layers.
+            use_fov_head: Whether to use the field-of-view head.
+            fov_encoder: A separate encoder for the field of view.
+        """
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        dim_decoder = decoder.dim_decoder
+        self.head = nn.Sequential(
+            nn.Conv2d(
+                dim_decoder, dim_decoder // 2, kernel_size=3, stride=1, padding=1
+            ),
+            nn.ConvTranspose2d(
+                in_channels=dim_decoder // 2,
+                out_channels=dim_decoder // 2,
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=True,
+            ),
+            nn.Conv2d(
+                dim_decoder // 2,
+                last_dims[0],
+                kernel_size=3,
+                stride=1,
+                padding=1,
+            ),
+            nn.ReLU(True),
+            nn.Conv2d(last_dims[0], last_dims[1], kernel_size=1, stride=1, padding=0),
+            nn.ReLU(),
+        )
+        # Set the final convolution layer's bias to be 0.
+        self.head[4].bias.data.fill_(0)
+        # Set the FOV estimation head.
+        if use_fov_head:
+            self.fov = FOVNetwork(num_features=dim_decoder, fov_encoder=fov_encoder)
+    @property
+    def img_size(self) -> int:
+        """Return the internal image size of the network."""
+        return self.encoder.img_size
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Decode by projection and fusion of multi-resolution encodings.
+        Args:
+        ----
+            x (torch.Tensor): Input image.
+        Returns:
+        -------
+            The canonical inverse depth map [m] and the optional estimated field of view [deg].
+        """
+        _, _, H, W = x.shape
+        assert H == self.img_size and W == self.img_size
+        encodings = self.encoder(x)
+        features, features_0 = self.decoder(encodings)
+        canonical_inverse_depth = self.head(features)
+        fov_deg = None
+        if hasattr(self, "fov"):
+            fov_deg = self.fov.forward(x, features_0.detach())
+        return canonical_inverse_depth, fov_deg
+    @torch.no_grad()
+    def infer(
+        self,
+        x: torch.Tensor,
+        f_px: Optional[Union[float, torch.Tensor]] = None,
+        interpolation_mode="bilinear",
+    ) -> Mapping[str, torch.Tensor]:
+        """Infer depth and fov for a given image.
+        If the image is not at network resolution, it is resized to 1536x1536 and
+        the estimated depth is resized to the original image resolution.
+        Note: if the focal length is given, the estimated value is ignored and the provided
+        focal length is use to generate the metric depth values.
+        Args:
+        ----
+            x (torch.Tensor): Input image
+            f_px (torch.Tensor): Optional focal length in pixels corresponding to `x`.
+            interpolation_mode (str): Interpolation function for downsampling/upsampling.
+        Returns:
+        -------
+            Tensor dictionary (torch.Tensor): depth [m], focallength [pixels].
+        """
+        if len(x.shape) == 3:
+            x = x.unsqueeze(0)
+        _, _, H, W = x.shape
+        resize = H != self.img_size or W != self.img_size
+        if resize:
+            x = nn.functional.interpolate(
+                x,
+                size=(self.img_size, self.img_size),
+                mode=interpolation_mode,
+                align_corners=False,
+            )
+        canonical_inverse_depth, fov_deg = self.forward(x)
+        if f_px is None:
+            f_px = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_deg.to(torch.float)))
+        inverse_depth = canonical_inverse_depth * (W / f_px)
+        f_px = f_px.squeeze()
+        if resize:
+            inverse_depth = nn.functional.interpolate(
+                inverse_depth, size=(H, W), mode=interpolation_mode, align_corners=False
+            )
+        depth = 1.0 / torch.clamp(inverse_depth, min=1e-4, max=1e4)
+        return {
+            "depth": depth.squeeze(),
+            "focallength_px": f_px,
+        }

tools/DepthPro/src/depth_pro/eval/boundary_metrics.py ADDED Viewed

	@@ -0,0 +1,332 @@

+from typing import Iterator, List, Tuple
+import numpy as np
+def connected_component(r: np.ndarray, c: np.ndarray) -> List[List[int]]:
+    """Find connected components in the given row and column indices.
+    Args:
+    ----
+        r (np.ndarray): Row indices.
+        c (np.ndarray): Column indices.
+    Yields:
+    ------
+        List[int]: Indices of connected components.
+    """
+    indices = [0]
+    for i in range(1, r.size):
+        if r[i] == r[indices[-1]] and c[i] == c[indices[-1]] + 1:
+            indices.append(i)
+        else:
+            yield indices
+            indices = [i]
+    yield indices
+def nms_horizontal(ratio: np.ndarray, threshold: float) -> np.ndarray:
+    """Apply Non-Maximum Suppression (NMS) horizontally on the given ratio matrix.
+    Args:
+    ----
+        ratio (np.ndarray): Input ratio matrix.
+        threshold (float): Threshold for NMS.
+    Returns:
+    -------
+        np.ndarray: Binary mask after applying NMS.
+    """
+    mask = np.zeros_like(ratio, dtype=bool)
+    r, c = np.nonzero(ratio > threshold)
+    if len(r) == 0:
+        return mask
+    for ids in connected_component(r, c):
+        values = [ratio[r[i], c[i]] for i in ids]
+        mi = np.argmax(values)
+        mask[r[ids[mi]], c[ids[mi]]] = True
+    return mask
+def nms_vertical(ratio: np.ndarray, threshold: float) -> np.ndarray:
+    """Apply Non-Maximum Suppression (NMS) vertically on the given ratio matrix.
+    Args:
+    ----
+        ratio (np.ndarray): Input ratio matrix.
+        threshold (float): Threshold for NMS.
+    Returns:
+    -------
+        np.ndarray: Binary mask after applying NMS.
+    """
+    return np.transpose(nms_horizontal(np.transpose(ratio), threshold))
+def fgbg_depth(
+    d: np.ndarray, t: float
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Find foreground-background relations between neighboring pixels.
+    Args:
+    ----
+        d (np.ndarray): Depth matrix.
+        t (float): Threshold for comparison.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
+        left, top, right, and bottom foreground-background relations.
+    """
+    right_is_big_enough = (d[..., :, 1:] / d[..., :, :-1]) > t
+    left_is_big_enough = (d[..., :, :-1] / d[..., :, 1:]) > t
+    bottom_is_big_enough = (d[..., 1:, :] / d[..., :-1, :]) > t
+    top_is_big_enough = (d[..., :-1, :] / d[..., 1:, :]) > t
+    return (
+        left_is_big_enough,
+        top_is_big_enough,
+        right_is_big_enough,
+        bottom_is_big_enough,
+    )
+def fgbg_depth_thinned(
+    d: np.ndarray, t: float
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Find foreground-background relations between neighboring pixels with Non-Maximum Suppression.
+    Args:
+    ----
+        d (np.ndarray): Depth matrix.
+        t (float): Threshold for NMS.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
+        left, top, right, and bottom foreground-background relations with NMS applied.
+    """
+    right_is_big_enough = nms_horizontal(d[..., :, 1:] / d[..., :, :-1], t)
+    left_is_big_enough = nms_horizontal(d[..., :, :-1] / d[..., :, 1:], t)
+    bottom_is_big_enough = nms_vertical(d[..., 1:, :] / d[..., :-1, :], t)
+    top_is_big_enough = nms_vertical(d[..., :-1, :] / d[..., 1:, :], t)
+    return (
+        left_is_big_enough,
+        top_is_big_enough,
+        right_is_big_enough,
+        bottom_is_big_enough,
+    )
+def fgbg_binary_mask(
+    d: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Find foreground-background relations between neighboring pixels in binary masks.
+    Args:
+    ----
+        d (np.ndarray): Binary depth matrix.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
+        left, top, right, and bottom foreground-background relations in binary masks.
+    """
+    assert d.dtype == bool
+    right_is_big_enough = d[..., :, 1:] & ~d[..., :, :-1]
+    left_is_big_enough = d[..., :, :-1] & ~d[..., :, 1:]
+    bottom_is_big_enough = d[..., 1:, :] & ~d[..., :-1, :]
+    top_is_big_enough = d[..., :-1, :] & ~d[..., 1:, :]
+    return (
+        left_is_big_enough,
+        top_is_big_enough,
+        right_is_big_enough,
+        bottom_is_big_enough,
+    )
+def edge_recall_matting(pr: np.ndarray, gt: np.ndarray, t: float) -> float:
+    """Calculate edge recall for image matting.
+    Args:
+    ----
+        pr (np.ndarray): Predicted depth matrix.
+        gt (np.ndarray): Ground truth binary mask.
+        t (float): Threshold for NMS.
+    Returns:
+    -------
+        float: Edge recall value.
+    """
+    assert gt.dtype == bool
+    ap, bp, cp, dp = fgbg_depth_thinned(pr, t)
+    ag, bg, cg, dg = fgbg_binary_mask(gt)
+    return 0.25 * (
+        np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1)
+        + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1)
+        + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1)
+        + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1)
+    )
+def boundary_f1(
+    pr: np.ndarray,
+    gt: np.ndarray,
+    t: float,
+    return_p: bool = False,
+    return_r: bool = False,
+) -> float:
+    """Calculate Boundary F1 score.
+    Args:
+    ----
+        pr (np.ndarray): Predicted depth matrix.
+        gt (np.ndarray): Ground truth depth matrix.
+        t (float): Threshold for comparison.
+        return_p (bool, optional): If True, return precision. Defaults to False.
+        return_r (bool, optional): If True, return recall. Defaults to False.
+    Returns:
+    -------
+        float: Boundary F1 score, or precision, or recall depending on the flags.
+    """
+    ap, bp, cp, dp = fgbg_depth(pr, t)
+    ag, bg, cg, dg = fgbg_depth(gt, t)
+    r = 0.25 * (
+        np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1)
+        + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1)
+        + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1)
+        + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1)
+    )
+    p = 0.25 * (
+        np.count_nonzero(ap & ag) / max(np.count_nonzero(ap), 1)
+        + np.count_nonzero(bp & bg) / max(np.count_nonzero(bp), 1)
+        + np.count_nonzero(cp & cg) / max(np.count_nonzero(cp), 1)
+        + np.count_nonzero(dp & dg) / max(np.count_nonzero(dp), 1)
+    )
+    if r + p == 0:
+        return 0.0
+    if return_p:
+        return p
+    if return_r:
+        return r
+    return 2 * (r * p) / (r + p)
+def get_thresholds_and_weights(
+    t_min: float, t_max: float, N: int
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Generate thresholds and weights for the given range.
+    Args:
+    ----
+        t_min (float): Minimum threshold.
+        t_max (float): Maximum threshold.
+        N (int): Number of thresholds.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray]: Array of thresholds and corresponding weights.
+    """
+    thresholds = np.linspace(t_min, t_max, N)
+    weights = thresholds / thresholds.sum()
+    return thresholds, weights
+def invert_depth(depth: np.ndarray, eps: float = 1e-6) -> np.ndarray:
+    """Inverts a depth map with numerical stability.
+    Args:
+    ----
+        depth (np.ndarray): Depth map to be inverted.
+        eps (float): Minimum value to avoid division by zero (default is 1e-6).
+    Returns:
+    -------
+    np.ndarray: Inverted depth map.
+    """
+    inverse_depth = 1.0 / depth.clip(min=eps)
+    return inverse_depth
+def SI_boundary_F1(
+    predicted_depth: np.ndarray,
+    target_depth: np.ndarray,
+    t_min: float = 1.05,
+    t_max: float = 1.25,
+    N: int = 10,
+) -> float:
+    """Calculate Scale-Invariant Boundary F1 Score for depth-based ground-truth.
+    Args:
+    ----
+        predicted_depth (np.ndarray): Predicted depth matrix.
+        target_depth (np.ndarray): Ground truth depth matrix.
+        t_min (float, optional): Minimum threshold. Defaults to 1.05.
+        t_max (float, optional): Maximum threshold. Defaults to 1.25.
+        N (int, optional): Number of thresholds. Defaults to 10.
+    Returns:
+    -------
+        float: Scale-Invariant Boundary F1 Score.
+    """
+    assert predicted_depth.ndim == target_depth.ndim == 2
+    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
+    f1_scores = np.array(
+        [
+            boundary_f1(invert_depth(predicted_depth), invert_depth(target_depth), t)
+            for t in thresholds
+        ]
+    )
+    return np.sum(f1_scores * weights)
+def SI_boundary_Recall(
+    predicted_depth: np.ndarray,
+    target_mask: np.ndarray,
+    t_min: float = 1.05,
+    t_max: float = 1.25,
+    N: int = 10,
+    alpha_threshold: float = 0.1,
+) -> float:
+    """Calculate Scale-Invariant Boundary Recall Score for mask-based ground-truth.
+    Args:
+    ----
+        predicted_depth (np.ndarray): Predicted depth matrix.
+        target_mask (np.ndarray): Ground truth binary mask.
+        t_min (float, optional): Minimum threshold. Defaults to 1.05.
+        t_max (float, optional): Maximum threshold. Defaults to 1.25.
+        N (int, optional): Number of thresholds. Defaults to 10.
+        alpha_threshold (float, optional): Threshold for alpha masking. Defaults to 0.1.
+    Returns:
+    -------
+        float: Scale-Invariant Boundary Recall Score.
+    """
+    assert predicted_depth.ndim == target_mask.ndim == 2
+    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
+    thresholded_target = target_mask > alpha_threshold
+    recall_scores = np.array(
+        [
+            edge_recall_matting(
+                invert_depth(predicted_depth), thresholded_target, t=float(t)
+            )
+            for t in thresholds
+        ]
+    )
+    weighted_recall = np.sum(recall_scores * weights)
+    return weighted_recall

tools/DepthPro/src/depth_pro/eval/dis5k_sample_list.txt ADDED Viewed

	@@ -0,0 +1,200 @@

+DIS5K/DIS-TE1/im/12#Graphics#4#TrafficSign#8245751856_821be14f86_o.jpg
+DIS5K/DIS-TE1/im/13#Insect#4#Butterfly#16023994688_7ff8cdccb1_o.jpg
+DIS5K/DIS-TE1/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205538.jpg
+DIS5K/DIS-TE1/im/14#Kitchenware#8#SweetStand#4848284981_fc90f54b50_o.jpg
+DIS5K/DIS-TE1/im/17#Non-motor Vehicle#4#Cart#15012855035_d10b57014f_o.jpg
+DIS5K/DIS-TE1/im/2#Aircraft#5#Kite#13104545564_5afceec9bd_o.jpg
+DIS5K/DIS-TE1/im/20#Sports#10#Skateboarding#8472763540_bb2390e928_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#14#Sword#32473146960_dcc6b77848_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#15#Tapeline#9680492386_2d2020f282_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#4#Flag#507752845_ef852100f0_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#6#Key#11966089533_3becd78b44_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#8#Scale#31946428472_d28def471b_o.jpg
+DIS5K/DIS-TE1/im/22#Weapon#4#Rifle#8472656430_3eb908b211_o.jpg
+DIS5K/DIS-TE1/im/8#Electronics#3#Earphone#1177468301_641df8c267_o.jpg
+DIS5K/DIS-TE1/im/8#Electronics#9#MusicPlayer#2235782872_7d47847bb4_o.jpg
+DIS5K/DIS-TE2/im/11#Furniture#13#Ladder#3878434417_2ed740586e_o.jpg
+DIS5K/DIS-TE2/im/13#Insect#1#Ant#27047700955_3b3a1271f8_o.jpg
+DIS5K/DIS-TE2/im/13#Insect#11#Spider#5567179191_38d1f65589_o.jpg
+DIS5K/DIS-TE2/im/13#Insect#8#Locust#5237933769_e6687c05e4_o.jpg
+DIS5K/DIS-TE2/im/14#Kitchenware#2#DishRack#70838854_40cf689da7_o.jpg
+DIS5K/DIS-TE2/im/14#Kitchenware#8#SweetStand#8467929412_fef7f4275d_o.jpg
+DIS5K/DIS-TE2/im/16#Music Instrument#2#Harp#28058219806_28e05ff24a_o.jpg
+DIS5K/DIS-TE2/im/17#Non-motor Vehicle#1#BabyCarriage#29794777180_2e1695a0cf_o.jpg
+DIS5K/DIS-TE2/im/19#Ship#3#Sailboat#22442908623_5977e3becf_o.jpg
+DIS5K/DIS-TE2/im/2#Aircraft#5#Kite#44654358051_1400e71cc4_o.jpg
+DIS5K/DIS-TE2/im/21#Tool#11#Stand#IMG_20210520_205442.jpg
+DIS5K/DIS-TE2/im/21#Tool#17#Tripod#9318977876_34615ec9a0_o.jpg
+DIS5K/DIS-TE2/im/5#Artifact#3#Handcraft#50860882577_8482143b1b_o.jpg
+DIS5K/DIS-TE2/im/8#Electronics#10#Robot#3093360210_fee54dc5c5_o.jpg
+DIS5K/DIS-TE2/im/8#Electronics#6#Microphone#47411477652_6da66cbc10_o.jpg
+DIS5K/DIS-TE3/im/14#Kitchenware#4#Kitchenware#2451122898_ef883175dd_o.jpg
+DIS5K/DIS-TE3/im/15#Machine#4#SewingMachine#9311164128_97ba1d3947_o.jpg
+DIS5K/DIS-TE3/im/16#Music Instrument#2#Harp#7670920550_59e992fd7b_o.jpg
+DIS5K/DIS-TE3/im/17#Non-motor Vehicle#1#BabyCarriage#8389984877_1fddf8715c_o.jpg
+DIS5K/DIS-TE3/im/17#Non-motor Vehicle#3#Carriage#5947122724_98e0fc3d1f_o.jpg
+DIS5K/DIS-TE3/im/2#Aircraft#2#Balloon#2487168092_641505883f_o.jpg
+DIS5K/DIS-TE3/im/2#Aircraft#4#Helicopter#8401177591_06c71c8df2_o.jpg
+DIS5K/DIS-TE3/im/20#Sports#1#Archery#12520003103_faa43ea3e0_o.jpg
+DIS5K/DIS-TE3/im/21#Tool#11#Stand#IMG_20210709_221507.jpg
+DIS5K/DIS-TE3/im/21#Tool#2#Clip#5656649687_63d0c6696d_o.jpg
+DIS5K/DIS-TE3/im/21#Tool#6#Key#12878459244_6387a140ea_o.jpg
+DIS5K/DIS-TE3/im/3#Aquatic#1#Lobster#109214461_f52b4b6093_o.jpg
+DIS5K/DIS-TE3/im/4#Architecture#19#Windmill#20195851863_2627117e0e_o.jpg
+DIS5K/DIS-TE3/im/5#Artifact#2#Cage#5821476369_ea23927487_o.jpg
+DIS5K/DIS-TE3/im/8#Electronics#7#MobileHolder#49732997896_7f53c290b5_o.jpg
+DIS5K/DIS-TE4/im/13#Insect#6#Centipede#15302179708_a267850881_o.jpg
+DIS5K/DIS-TE4/im/17#Non-motor Vehicle#11#Tricycle#5771069105_a3aef6f665_o.jpg
+DIS5K/DIS-TE4/im/17#Non-motor Vehicle#2#Bicycle#4245936196_fdf812dcb7_o.jpg
+DIS5K/DIS-TE4/im/17#Non-motor Vehicle#9#ShoppingCart#4674052920_a5b7a2b236_o.jpg
+DIS5K/DIS-TE4/im/18#Plant#1#Bonsai#3539420884_ca8973e2c0_o.jpg
+DIS5K/DIS-TE4/im/2#Aircraft#6#Parachute#33590416634_9d6f2325e7_o.jpg
+DIS5K/DIS-TE4/im/20#Sports#1#Archery#46924476515_0be1caa684_o.jpg
+DIS5K/DIS-TE4/im/20#Sports#8#Racket#19337607166_dd1985fb59_o.jpg
+DIS5K/DIS-TE4/im/21#Tool#6#Key#3193329588_839b0c74ce_o.jpg
+DIS5K/DIS-TE4/im/5#Artifact#2#Cage#5821886526_0573ba2d0d_o.jpg
+DIS5K/DIS-TE4/im/5#Artifact#3#Handcraft#50105138282_3c1d02c968_o.jpg
+DIS5K/DIS-TE4/im/8#Electronics#1#Antenna#4305034305_874f21a701_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#1#Bag#15554964549_3105e51b6f_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#1#Bag#41104261980_098a6c4a56_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#2#Clothes#2284764037_871b2e8ca4_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#1824643784_70d0134156_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#3590020230_37b09a29b3_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#4809652879_4da8a69f3b_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#792204934_f9b28f99b4_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#5#Jewelry#13909132974_c4750c5fb7_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#7#Shoe#2483391615_9199ece8d6_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#8#Watch#4343266960_f6633b029b_o.jpg
+DIS5K/DIS-TR/im/10#Frame#2#BicycleFrame#17897573_42964dd104_o.jpg
+DIS5K/DIS-TR/im/10#Frame#5#Rack#15898634812_64807069ff_o.jpg
+DIS5K/DIS-TR/im/10#Frame#5#Rack#23928546819_c184cb0b60_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#19#Shower#6189119596_77bcfe80ee_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#2#Bench#3263647075_9306e280b5_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#5#CoatHanger#12774091054_cd5ff520ef_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#6#DentalChair#13878156865_d0439dcb32_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#9#Easel#5861024714_2070cd480c_o.jpg
+DIS5K/DIS-TR/im/12#Graphics#4#TrafficSign#40621867334_f3c32ec189_o.jpg
+DIS5K/DIS-TR/im/13#Insect#1#Ant#3295038190_db5dd0d4f4_o.jpg
+DIS5K/DIS-TR/im/13#Insect#10#Mosquito#24341339_a88a1dad4c_o.jpg
+DIS5K/DIS-TR/im/13#Insect#11#Spider#27171518270_63b78069ff_o.jpg
+DIS5K/DIS-TR/im/13#Insect#11#Spider#49925050281_fa727c154e_o.jpg
+DIS5K/DIS-TR/im/13#Insect#2#Beatle#279616486_2f1e64f591_o.jpg
+DIS5K/DIS-TR/im/13#Insect#3#Bee#43892067695_82cf3e536b_o.jpg
+DIS5K/DIS-TR/im/13#Insect#6#Centipede#20874281788_3e15c90a1c_o.jpg
+DIS5K/DIS-TR/im/13#Insect#7#Dragonfly#14106671120_1b824d77e4_o.jpg
+DIS5K/DIS-TR/im/13#Insect#8#Locust#21637491048_676ef7c9f7_o.jpg
+DIS5K/DIS-TR/im/13#Insect#9#Mantis#1381120202_9dff6987b2_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#1#Cup#12812517473_327d6474b8_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#10#WineGlass#6402491641_389275d4d1_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#3#Hydrovalve#3129932040_8c05825004_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#2881934780_87d5218ebb_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205527.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#6#Spoon#32989113501_b69eccf0df_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#8#SweetStand#2867322189_c56d1e0b87_o.jpg
+DIS5K/DIS-TR/im/15#Machine#1#Gear#19217846720_f5f2807475_o.jpg
+DIS5K/DIS-TR/im/15#Machine#2#Machine#1620160659_9571b7a7ab_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#2#Harp#6012801603_1a6e2c16a6_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#5#Trombone#8683292118_d223c17ccb_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#6#Trumpet#8393262740_b8c216142c_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#8#Violin#1511267391_40e4949d68_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#1#BabyCarriage#6989512997_38b3dbc88b_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#14627183228_b2d68cf501_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#2932226475_1b2403e549_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#5420155648_86459905b8_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#2#Bicycle#IMG_20210513_134904.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#3#Carriage#3311962551_6f211b7bd6_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#4#Cart#2609732026_baf7fff3a1_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#5#Handcart#5821282211_201cefeaf2_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#7#Mower#5779003232_3bb3ae531a_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#10051622843_ace07e32b8_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#8075259294_f23e243849_o.jpg
+DIS5K/DIS-TR/im/18#Plant#2#Tree#44800999741_e377e16dbb_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#2631761913_3ac67d0223_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#37707911566_e908a261b6_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#3#HangGlider#2557220131_b8506920c5_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#4#Helicopter#6215659280_5dbd9b4546_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#6#Parachute#20185790493_e56fcaf8c6_o.jpg
+DIS5K/DIS-TR/im/20#Sports#1#Archery#3871269982_ae4c59a7eb_o.jpg
+DIS5K/DIS-TR/im/20#Sports#9#RockClimbing#9662433268_51299bc50e_o.jpg
+DIS5K/DIS-TR/im/21#Tool#14#Sword#26258479365_2950d7fa37_o.jpg
+DIS5K/DIS-TR/im/21#Tool#15#Tapeline#15505703447_e0fdeaa5a6_o.jpg
+DIS5K/DIS-TR/im/21#Tool#4#Flag#26678602024_9b665742de_o.jpg
+DIS5K/DIS-TR/im/21#Tool#4#Flag#5774823110_d603ce3cc8_o.jpg
+DIS5K/DIS-TR/im/21#Tool#5#Hook#6867989814_dba18d673c_o.jpg
+DIS5K/DIS-TR/im/22#Weapon#4#Rifle#4451713125_cd91719189_o.jpg
+DIS5K/DIS-TR/im/3#Aquatic#2#Seadragon#4910944581_913139b238_o.jpg
+DIS5K/DIS-TR/im/4#Architecture#12#Scaffold#3661448960_8aff24cc4d_o.jpg
+DIS5K/DIS-TR/im/4#Architecture#13#Sculpture#6385318715_9a88d4eba7_o.jpg
+DIS5K/DIS-TR/im/4#Architecture#17#Well#5011603479_75cf42808a_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#2#Cage#4892828841_7f1bc05682_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#15404211628_9e9ff2ce2e_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#3200169865_7c84cfcccf_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#5859295071_c217e7c22f_o.jpg
+DIS5K/DIS-TR/im/6#Automobile#10#SteeringWheel#17200338026_f1e2122d8e_o.jpg
+DIS5K/DIS-TR/im/6#Automobile#3#Car#3780893425_1a7d275e09_o.jpg
+DIS5K/DIS-TR/im/6#Automobile#5#Crane#15282506502_1b1132a7c3_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#16767791875_8e6df41752_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#3291433361_38747324c4_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#4195104238_12a754c61a_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#49645415132_61e5664ecf_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#IMG_20210521_232406.jpg
+DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#3298312021_92f431e3e9_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#47950134773_fbfff63f4e_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#11#VacuumCleaner#5448403677_6a29e21881_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#2#CeilingLamp#611568868_680ed5d39f_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#3#Fan#3391683115_990525a693_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#6#StreetLamp#150049122_0692266618_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#9#TransmissionTower#31433908671_7e7e277dfe_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#1#Antenna#8727884873_e0622ee5c4_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#2#Camcorder#4172690390_7e5f280ace_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#3#Earphone#413984555_f290febdf5_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#5#Headset#30574225373_3717ed9fa4_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#6#Microphone#538006482_4aae4f5bd6_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#9#MusicPlayer#1306012480_2ea80d2afd_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#1#GymEquipment#33071754135_8f3195cbd1_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#2305807849_be53d724ea_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#3862040422_5bbf903204_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#3#OutdoorFitnessEquipment#10814507005_3dacaa28b3_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#4#FerrisWheel#81640293_4b0ee62040_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#5#Swing#49867339188_08073f4b76_o.jpg
+DIS5K/DIS-VD/im/1#Accessories#1#Bag#6815402415_e01c1a41e6_o.jpg
+DIS5K/DIS-VD/im/1#Accessories#5#Jewelry#2744070193_1486582e8d_o.jpg
+DIS5K/DIS-VD/im/10#Frame#1#BasketballHoop#IMG_20210521_232650.jpg
+DIS5K/DIS-VD/im/10#Frame#5#Rack#6156611713_49ebf12b1e_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#11#Handrail#3276641240_1b84b5af85_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#13#Ladder#33423266_5391cf47e9_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#17#Table#3725111755_4fc101e7ab_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#2#Bench#35556410400_7235b58070_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#4#Chair#3301769985_e49de6739f_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#6#DentalChair#23811071619_2a95c3a688_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#9#Easel#8322807354_df6d56542e_o.jpg
+DIS5K/DIS-VD/im/13#Insect#10#Mosquito#12391674863_0cdf430d3f_o.jpg
+DIS5K/DIS-VD/im/13#Insect#7#Dragonfly#14693028899_344ea118f2_o.jpg
+DIS5K/DIS-VD/im/14#Kitchenware#10#WineGlass#4450148455_8f460f541a_o.jpg
+DIS5K/DIS-VD/im/14#Kitchenware#3#Hydrovalve#IMG_20210520_203410.jpg
+DIS5K/DIS-VD/im/15#Machine#3#PlowHarrow#34521712846_df4babb024_o.jpg
+DIS5K/DIS-VD/im/16#Music Instrument#5#Trombone#6222242743_e7189405cd_o.jpg
+DIS5K/DIS-VD/im/17#Non-motor Vehicle#12#Wheel#25677578797_ea47e1d9e8_o.jpg
+DIS5K/DIS-VD/im/17#Non-motor Vehicle#2#Bicycle#5153474856_21560b081b_o.jpg
+DIS5K/DIS-VD/im/17#Non-motor Vehicle#7#Mower#16992510572_8a6ff27398_o.jpg
+DIS5K/DIS-VD/im/19#Ship#2#Canoe#40571458163_7faf8b73d9_o.jpg
+DIS5K/DIS-VD/im/2#Aircraft#1#Airplane#4270588164_66a619e834_o.jpg
+DIS5K/DIS-VD/im/2#Aircraft#4#Helicopter#86789665_650b94b2ee_o.jpg
+DIS5K/DIS-VD/im/20#Sports#14#Wakesurfing#5589577652_5061c168d2_o.jpg
+DIS5K/DIS-VD/im/21#Tool#10#Spade#37018312543_63b21b0784_o.jpg
+DIS5K/DIS-VD/im/21#Tool#14#Sword#24789047250_42df9bf422_o.jpg
+DIS5K/DIS-VD/im/21#Tool#18#Umbrella#IMG_20210513_140445.jpg
+DIS5K/DIS-VD/im/21#Tool#6#Key#43939732715_5a6e28b518_o.jpg
+DIS5K/DIS-VD/im/22#Weapon#1#Cannon#12758066705_90b54295e7_o.jpg
+DIS5K/DIS-VD/im/22#Weapon#4#Rifle#8019368790_fb6dc469a7_o.jpg
+DIS5K/DIS-VD/im/3#Aquatic#5#Shrimp#2582833427_7a99e7356e_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#12#Scaffold#1013402687_590750354e_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#13#Sculpture#17176841759_272a3ed6e3_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#14#Stair#15079108505_0d11281624_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#19#Windmill#2928111082_ceb3051c04_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#3#Crack#3551574032_17dd106d31_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#5#GasStation#4564307581_c3069bdc62_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#8#ObservationTower#2704526950_d4f0ddc807_o.jpg
+DIS5K/DIS-VD/im/5#Artifact#3#Handcraft#10873642323_1bafce3aa5_o.jpg
+DIS5K/DIS-VD/im/6#Automobile#11#Tractor#8594504006_0c2c557d85_o.jpg
+DIS5K/DIS-VD/im/8#Electronics#3#Earphone#8106454803_1178d867cc_o.jpg

tools/DepthPro/src/depth_pro/network/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2	+ """Depth Pro network blocks."""

tools/DepthPro/src/depth_pro/network/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (197 Bytes). View file

tools/DepthPro/src/depth_pro/network/__pycache__/decoder.cpython-310.pyc ADDED Viewed

Binary file (5.32 kB). View file

tools/DepthPro/src/depth_pro/network/__pycache__/encoder.cpython-310.pyc ADDED Viewed

Binary file (7.43 kB). View file

tools/DepthPro/src/depth_pro/network/__pycache__/fov.cpython-310.pyc ADDED Viewed

Binary file (2.09 kB). View file

tools/DepthPro/src/depth_pro/network/__pycache__/vit.cpython-310.pyc ADDED Viewed

Binary file (2.81 kB). View file

tools/DepthPro/src/depth_pro/network/__pycache__/vit_factory.cpython-310.pyc ADDED Viewed

Binary file (2.96 kB). View file

tools/DepthPro/src/depth_pro/network/decoder.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""Copyright (C) 2024 Apple Inc. All Rights Reserved.
+Dense Prediction Transformer Decoder architecture.
+Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413
+"""
+from __future__ import annotations
+from typing import Iterable
+import torch
+from torch import nn
class MultiresConvDecoder(nn.Module):
    """Decoder for multi-resolution encodings.

    Each encoder level is first projected to ``dim_decoder`` channels, then the
    levels are fused one after another, starting from the last level and
    finishing at level 0, with one ``FeatureFusionBlock2d`` per level.
    """

    def __init__(
        self,
        dims_encoder: Iterable[int],
        dim_decoder: int,
    ):
        """Initialize multiresolution convolutional decoder.

        Args:
        ----
            dims_encoder: Expected dims at each level from the encoder.
            dim_decoder: Dim of decoder features.

        """
        super().__init__()
        self.dims_encoder = list(dims_encoder)
        self.dim_decoder = dim_decoder
        self.dim_out = dim_decoder

        num_encoders = len(self.dims_encoder)

        # At the highest resolution, i.e. level 0, we apply projection w/ 1x1 convolution
        # when the dimensions mismatch. Otherwise we do not do anything, which is
        # the default behavior of monodepth.
        conv0 = (
            nn.Conv2d(self.dims_encoder[0], dim_decoder, kernel_size=1, bias=False)
            if self.dims_encoder[0] != dim_decoder
            else nn.Identity()
        )

        # Every other level always gets a 3x3 projection to the decoder width.
        convs = [conv0] + [
            nn.Conv2d(
                dim_in,
                dim_decoder,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=False,
            )
            for dim_in in self.dims_encoder[1:]
        ]
        self.convs = nn.ModuleList(convs)

        # Level 0 is the only fusion block created without a deconv stage.
        self.fusions = nn.ModuleList(
            [
                FeatureFusionBlock2d(
                    num_features=dim_decoder,
                    deconv=(level != 0),
                    batch_norm=False,
                )
                for level in range(num_encoders)
            ]
        )

    def forward(
        self, encodings: list[torch.Tensor]
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Decode the multi-resolution encodings.

        Args:
        ----
            encodings: One feature map per encoder level, ordered like
                ``dims_encoder`` (level 0 first).

        Returns:
        -------
            A pair ``(features, lowres_features)``: the fully fused features and
            the projected features of the last level before any fusion.

        Raises:
        ------
            ValueError: If the number of encodings differs from the number of
                configured encoder levels.

        """
        num_levels = len(encodings)
        num_encoders = len(self.dims_encoder)

        if num_levels != num_encoders:
            # Report the count that is actually checked above.
            raise ValueError(
                f"Got encoder output levels={num_levels}, expected levels={num_encoders}."
            )

        # Project features of different encoder dims to the same decoder dim.
        # Fuse features from the lowest resolution (num_levels-1)
        # to the highest (0).
        features = self.convs[-1](encodings[-1])
        lowres_features = features
        features = self.fusions[-1](features)
        for i in range(num_levels - 2, -1, -1):
            features_i = self.convs[i](encodings[i])
            features = self.fusions[i](features, features_i)
        return features, lowres_features
class ResidualBlock(nn.Module):
    """Generic residual wrapper: ``shortcut(x) + residual(x)``.

    Follows the residual formulation of
        He et al. - Identity Mappings in Deep Residual Networks (2016),
        https://arxiv.org/abs/1603.05027
    The residual branch (and optionally the shortcut) is supplied by the caller,
    so factory functions can customize the block.
    """

    def __init__(self, residual: nn.Module, shortcut: nn.Module | None = None) -> None:
        """Store the residual branch and the optional shortcut transform.

        Args:
        ----
            residual: Module producing the update that is added to the input.
            shortcut: Optional module applied to the input before the addition;
                when ``None`` the input is used unchanged.

        """
        super().__init__()
        self.residual = residual
        self.shortcut = shortcut

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the (optionally transformed) input plus the residual branch output."""
        # The residual branch runs first, on the untouched input.
        branch = self.residual(x)
        skip = x if self.shortcut is None else self.shortcut(x)
        return skip + branch
class FeatureFusionBlock2d(nn.Module):
    """Feature fusion for DPT.

    Optionally adds a residual-processed skip input to the main input, refines
    the sum with a second residual block, optionally upsamples with a stride-2
    transposed convolution, and finishes with a 1x1 convolution.
    """

    def __init__(
        self,
        num_features: int,
        deconv: bool = False,
        batch_norm: bool = False,
    ):
        """Initialize feature fusion block.

        Args:
        ----
            num_features: Input and output dimensions.
            deconv: Whether to use deconv before the final output conv.
            batch_norm: Whether to use batch normalization in resnet blocks.

        """
        super().__init__()

        self.resnet1 = self._residual_block(num_features, batch_norm)
        self.resnet2 = self._residual_block(num_features, batch_norm)

        self.use_deconv = deconv
        if self.use_deconv:
            # The `deconv` attribute only exists when upsampling was requested.
            self.deconv = nn.ConvTranspose2d(
                num_features,
                num_features,
                kernel_size=2,
                stride=2,
                padding=0,
                bias=False,
            )

        self.out_conv = nn.Conv2d(
            in_channels=num_features,
            out_channels=num_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x0: torch.Tensor, x1: torch.Tensor | None = None) -> torch.Tensor:
        """Process and fuse input features.

        Args:
        ----
            x0: Main input features.
            x1: Optional skip features; when given they pass through
                ``resnet1`` and are added to ``x0``.

        Returns:
        -------
            The fused features after ``resnet2``, the optional deconv and the
            output convolution.

        """
        fused = x0
        if x1 is not None:
            fused = self.skip_add.add(fused, self.resnet1(x1))

        fused = self.resnet2(fused)
        if self.use_deconv:
            fused = self.deconv(fused)
        return self.out_conv(fused)

    @staticmethod
    def _residual_block(num_features: int, batch_norm: bool):
        """Create a residual block of two ReLU -> 3x3 conv (-> BatchNorm) stages."""
        stages = []
        for _ in range(2):
            stages.append(nn.ReLU(False))
            stages.append(
                nn.Conv2d(
                    num_features,
                    num_features,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    # The conv bias is dropped whenever batch norm follows it.
                    bias=not batch_norm,
                )
            )
            if batch_norm:
                stages.append(nn.BatchNorm2d(num_features))
        return ResidualBlock(nn.Sequential(*stages))

tools/DepthPro/src/depth_pro/network/encoder.py ADDED Viewed

	@@ -0,0 +1,332 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# DepthProEncoder combining patch and image encoders.
+from __future__ import annotations
+import math
+from typing import Iterable, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
class DepthProEncoder(nn.Module):
    """DepthPro Encoder.

    An encoder aimed at creating multi-resolution encodings from Vision Transformers.
    """

    def __init__(
        self,
        dims_encoder: Iterable[int],
        patch_encoder: nn.Module,
        image_encoder: nn.Module,
        hook_block_ids: Iterable[int],
        decoder_features: int,
    ):
        """Initialize DepthProEncoder.

        The framework
            1. creates an image pyramid,
            2. generates overlapping patches with a sliding window at each pyramid level,
            3. creates batched encodings via vision transformer backbones,
            4. produces multi-resolution encodings.

        Args:
        ----
            dims_encoder: Dimensions of the encoder at different layers. The
                first four entries are read.
            patch_encoder: Backbone used for patches. Must expose ``embed_dim``,
                ``patch_embed.img_size``, ``patch_embed.patch_size`` and ``blocks``.
            image_encoder: Backbone used for global image encoder. Must expose
                ``embed_dim``.
            hook_block_ids: Hooks to obtain intermediate features for the patch
                encoder model. The first two entries are read.
            decoder_features: Number of feature output in the decoder.

        """
        super().__init__()

        self.dims_encoder = list(dims_encoder)
        self.patch_encoder = patch_encoder
        self.image_encoder = image_encoder
        self.hook_block_ids = list(hook_block_ids)

        patch_encoder_embed_dim = patch_encoder.embed_dim
        image_encoder_embed_dim = image_encoder.embed_dim

        # Side length of the token grid produced by the patch backbone.
        self.out_size = int(
            patch_encoder.patch_embed.img_size[0] // patch_encoder.patch_embed.patch_size[0]
        )

        self.upsample_latent0 = self._create_project_upsample_block(
            dim_in=patch_encoder_embed_dim,
            dim_int=self.dims_encoder[0],
            dim_out=decoder_features,
            upsample_layers=3,
        )
        self.upsample_latent1 = self._create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[0], upsample_layers=2
        )

        self.upsample0 = self._create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[1], upsample_layers=1
        )
        self.upsample1 = self._create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[2], upsample_layers=1
        )
        self.upsample2 = self._create_project_upsample_block(
            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[3], upsample_layers=1
        )

        self.upsample_lowres = nn.ConvTranspose2d(
            in_channels=image_encoder_embed_dim,
            out_channels=self.dims_encoder[3],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
        )
        self.fuse_lowres = nn.Conv2d(
            in_channels=(self.dims_encoder[3] + self.dims_encoder[3]),
            out_channels=self.dims_encoder[3],
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        # Obtain intermediate outputs of the blocks.
        self.patch_encoder.blocks[self.hook_block_ids[0]].register_forward_hook(
            self._hook0
        )
        self.patch_encoder.blocks[self.hook_block_ids[1]].register_forward_hook(
            self._hook1
        )

    @staticmethod
    def _create_project_upsample_block(
        dim_in: int,
        dim_out: int,
        upsample_layers: int,
        dim_int: Optional[int] = None,
    ) -> nn.Module:
        """Build a 1x1 projection followed by ``upsample_layers`` stride-2 deconvs.

        ``dim_int`` is the channel count after the projection and defaults to
        ``dim_out``.
        """
        if dim_int is None:
            dim_int = dim_out
        # Projection.
        blocks = [
            nn.Conv2d(
                in_channels=dim_in,
                out_channels=dim_int,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )
        ]

        # Upsampling. Only the first deconv sees the intermediate width.
        blocks += [
            nn.ConvTranspose2d(
                in_channels=dim_int if i == 0 else dim_out,
                out_channels=dim_out,
                kernel_size=2,
                stride=2,
                padding=0,
                bias=False,
            )
            for i in range(upsample_layers)
        ]

        return nn.Sequential(*blocks)

    def _hook0(self, model, input, output):
        """Forward hook: keep the output of the first hooked backbone block."""
        self.backbone_highres_hook0 = output

    def _hook1(self, model, input, output):
        """Forward hook: keep the output of the second hooked backbone block."""
        self.backbone_highres_hook1 = output

    @property
    def img_size(self) -> int:
        """Return the full image size of the SPN network (4x the backbone image size)."""
        return self.patch_encoder.patch_embed.img_size[0] * 4

    def _create_pyramid(
        self, x: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Create a 3-level image pyramid (full, half and quarter resolution)."""
        # Original resolution: 1536 by default.
        x0 = x

        # Middle resolution: 768 by default.
        x1 = F.interpolate(
            x, size=None, scale_factor=0.5, mode="bilinear", align_corners=False
        )

        # Low resolution: 384 by default, corresponding to the backbone resolution.
        x2 = F.interpolate(
            x, size=None, scale_factor=0.25, mode="bilinear", align_corners=False
        )

        return x0, x1, x2

    def split(self, x: torch.Tensor, overlap_ratio: float = 0.25) -> torch.Tensor:
        """Split the input into small patches with sliding window.

        The window is a fixed 384 pixels wide and the stride is
        ``int(384 * (1 - overlap_ratio))``. The number of steps is derived from
        the last dimension only and reused for both spatial axes. Patches are
        concatenated along the batch dimension, row by row.
        """
        patch_size = 384
        patch_stride = int(patch_size * (1 - overlap_ratio))

        image_size = x.shape[-1]
        steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1

        x_patch_list = []
        for j in range(steps):
            j0 = j * patch_stride
            j1 = j0 + patch_size

            for i in range(steps):
                i0 = i * patch_stride
                i1 = i0 + patch_size
                x_patch_list.append(x[..., j0:j1, i0:i1])

        return torch.cat(x_patch_list, dim=0)

    def merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor:
        """Merge the patched input into a image with sliding window.

        ``x`` holds a square grid of patches stacked along the batch dimension
        (``batch_size`` entries per patch, row by row). ``padding`` cells are
        cropped from every patch side that touches a neighbouring patch before
        the patches are concatenated back into one map.
        """
        steps = int(math.sqrt(x.shape[0] // batch_size))

        idx = 0

        output_list = []
        for j in range(steps):
            output_row_list = []
            for i in range(steps):
                output = x[batch_size * idx : batch_size * (idx + 1)]

                # Crop only interior borders; outer image borders stay intact.
                if j != 0:
                    output = output[..., padding:, :]
                if i != 0:
                    output = output[..., :, padding:]
                if j != steps - 1:
                    output = output[..., :-padding, :]
                if i != steps - 1:
                    output = output[..., :, :-padding]

                output_row_list.append(output)
                idx += 1

            output_row = torch.cat(output_row_list, dim=-1)
            output_list.append(output_row)
        output = torch.cat(output_list, dim=-2)
        return output

    def reshape_feature(
        self,
        embeddings: torch.Tensor,
        width: int,
        height: int,
        cls_token_offset: int = 1,
    ) -> torch.Tensor:
        """Discard class token and reshape 1D feature map to a 2D grid.

        Args:
        ----
            embeddings: Token embeddings of shape (batch, tokens, dim).
            width: Width of the target grid.
            height: Height of the target grid.
            cls_token_offset: Number of leading tokens to drop; 0 keeps all.

        Returns:
        -------
            Tensor of shape (batch, dim, height, width).

        """
        b, _, c = embeddings.shape

        # Remove class token.
        if cls_token_offset > 0:
            embeddings = embeddings[:, cls_token_offset:, :]

        # Shape: (batch, height, width, dim) -> (batch, dim, height, width)
        embeddings = embeddings.reshape(b, height, width, c).permute(0, 3, 1, 2)
        return embeddings

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        """Encode input at multiple resolutions.

        Args:
        ----
            x (torch.Tensor): Input image.

        Returns:
        -------
            Multi resolution encoded features: the two upsampled intermediate
            (hooked) encodings of the highest-resolution patches, the upsampled
            encodings of the full- and half-resolution pyramid levels, and the
            fused low-resolution/global features.

        """
        batch_size = x.shape[0]

        # Step 0: create a 3-level image pyramid.
        x0, x1, x2 = self._create_pyramid(x)

        # Step 1: split to create batched overlapped mini-images at the backbone (BeiT/ViT/Dino)
        # resolution.
        # 5x5 @ 384x384 at the highest resolution (1536x1536).
        x0_patches = self.split(x0, overlap_ratio=0.25)
        # 3x3 @ 384x384 at the middle resolution (768x768).
        x1_patches = self.split(x1, overlap_ratio=0.5)
        # 1x1 @ 384x384 at the lowest resolution (384x384).
        x2_patches = x2

        # The highest-resolution patches come first in the concatenated batch;
        # their count (batch_size * 5 * 5 by default) selects them again below.
        num_highres_patches = len(x0_patches)

        # Concatenate all the sliding window patches and form a batch of size (35=5x5+3x3+1x1).
        x_pyramid_patches = torch.cat(
            (x0_patches, x1_patches, x2_patches),
            dim=0,
        )

        # Step 2: Run the backbone (BeiT) model and get the result of large batch size.
        x_pyramid_encodings = self.patch_encoder(x_pyramid_patches)
        x_pyramid_encodings = self.reshape_feature(
            x_pyramid_encodings, self.out_size, self.out_size
        )

        # Step 3: merging.
        # Merge highres latent encoding (captured by the forward hooks during step 2).
        x_latent0_encodings = self.reshape_feature(
            self.backbone_highres_hook0,
            self.out_size,
            self.out_size,
        )
        x_latent0_features = self.merge(
            x_latent0_encodings[:num_highres_patches], batch_size=batch_size, padding=3
        )

        x_latent1_encodings = self.reshape_feature(
            self.backbone_highres_hook1,
            self.out_size,
            self.out_size,
        )
        x_latent1_features = self.merge(
            x_latent1_encodings[:num_highres_patches], batch_size=batch_size, padding=3
        )

        # Split the 35 batch size from pyramid encoding back into 5x5+3x3+1x1.
        x0_encodings, x1_encodings, x2_encodings = torch.split(
            x_pyramid_encodings,
            [len(x0_patches), len(x1_patches), len(x2_patches)],
            dim=0,
        )

        # 96x96 feature maps by merging 5x5 @ 24x24 patches with overlaps.
        x0_features = self.merge(x0_encodings, batch_size=batch_size, padding=3)

        # 48x48 feature maps by merging 3x3 @ 24x24 patches with overlaps.
        x1_features = self.merge(x1_encodings, batch_size=batch_size, padding=6)

        # 24x24 feature maps.
        x2_features = x2_encodings

        # Apply the image encoder model.
        x_global_features = self.image_encoder(x2_patches)
        x_global_features = self.reshape_feature(
            x_global_features, self.out_size, self.out_size
        )

        # Upsample feature maps.
        x_latent0_features = self.upsample_latent0(x_latent0_features)
        x_latent1_features = self.upsample_latent1(x_latent1_features)

        x0_features = self.upsample0(x0_features)
        x1_features = self.upsample1(x1_features)
        x2_features = self.upsample2(x2_features)

        x_global_features = self.upsample_lowres(x_global_features)
        x_global_features = self.fuse_lowres(
            torch.cat((x2_features, x_global_features), dim=1)
        )

        return [
            x_latent0_features,
            x_latent1_features,
            x0_features,
            x1_features,
            x_global_features,
        ]

tools/DepthPro/src/depth_pro/network/fov.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# Field of View network architecture.
+from typing import Optional
+import torch
+from torch import nn
+from torch.nn import functional as F
class FOVNetwork(nn.Module):
    """Field of View estimation network."""

    def __init__(
        self,
        num_features: int,
        fov_encoder: Optional[nn.Module] = None,
    ):
        """Initialize the Field of View estimation block.

        Args:
        ----
            num_features: Number of features used.
            fov_encoder: Optional encoder to bring additional network capacity.
                Must expose ``embed_dim`` when given.

        """
        super().__init__()

        half = num_features // 2
        quarter = num_features // 4
        eighth = num_features // 8

        def _halving_stage(channels_in: int, channels_out: int) -> list:
            # Stride-2 3x3 convolution followed by an in-place ReLU.
            return [
                nn.Conv2d(channels_in, channels_out, kernel_size=3, stride=2, padding=1),
                nn.ReLU(True),
            ]

        # Create FOV head. Shape notes kept from the original comments:
        # 128 x 24 x 24 after the stem, then 64 x 12 x 12 and 32 x 6 x 6
        # (presumably for num_features=256 and a 48 x 48 input - TODO confirm).
        stem = _halving_stage(num_features, half)
        trunk = [
            *_halving_stage(half, quarter),
            *_halving_stage(quarter, eighth),
            nn.Conv2d(eighth, 1, kernel_size=6, stride=1, padding=0),
        ]

        if fov_encoder is None:
            # Without an extra encoder the stem is simply the first part of the head.
            trunk = stem + trunk
        else:
            self.encoder = nn.Sequential(
                fov_encoder, nn.Linear(fov_encoder.embed_dim, half)
            )
            self.downsample = nn.Sequential(*stem)
        self.head = nn.Sequential(*trunk)

    def forward(self, x: torch.Tensor, lowres_feature: torch.Tensor) -> torch.Tensor:
        """Forward the fov network.

        Args:
        ----
            x (torch.Tensor): Input image.
            lowres_feature (torch.Tensor): Low resolution feature.

        Returns:
        -------
            The field of view tensor.

        """
        # The `encoder` attribute exists only when a fov_encoder was supplied.
        if not hasattr(self, "encoder"):
            return self.head(lowres_feature)

        quarter_res = F.interpolate(
            x,
            size=None,
            scale_factor=0.25,
            mode="bilinear",
            align_corners=False,
        )
        # Drop the first token and move channels ahead of the token axis.
        tokens = self.encoder(quarter_res)[:, 1:].permute(0, 2, 1)
        lowres_feature = self.downsample(lowres_feature)
        combined = tokens.reshape_as(lowres_feature) + lowres_feature
        return self.head(combined)