# Extracted from a Hugging Face Spaces file view (Space status: Running).
# File size: 8,112 bytes; revision 56cd6b7; the original file has 167 lines.
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# DUSt3R model class
# --------------------------------------------------------
from copy import deepcopy
import torch
from .utils.misc import fill_default_args, freeze_all_params, is_symmetrized, interleave, transpose_to_landscape
from .heads import head_factory
from dust3r.patch_embed import get_patch_embed
import dust3r.utils.path_to_croco # noqa: F401
from croco.models.croco import CroCoNet # noqa
# Positive infinity; used below as the open bound in the default
# depth_mode / conf_mode tuples of AsymmetricCroCo3DStereo.__init__.
inf = float('inf')
class AsymmetricCroCo3DStereo(CroCoNet):
    """ Two siamese encoders, followed by two decoders.

    Both views are encoded by the same encoder blocks, then decoded by two
    separate decoder stacks (``dec_blocks`` for view1, ``dec_blocks2`` for
    view2).  The goal is to output 3d points directly, with both images
    expressed in view1's frame (hence the asymmetry).
    """

    def __init__(self,
                 output_mode='pts3d',
                 head_type='linear',
                 depth_mode=('exp', -inf, inf),
                 conf_mode=('exp', 1, inf),
                 freeze='none',
                 landscape_only=True,
                 patch_embed_cls='PatchEmbedDust3R',  # PatchEmbedDust3R or ManyAR_PatchEmbed
                 **croco_kwargs):
        """Build the CroCo backbone, then add the second decoder and the heads.

        Args:
            output_mode: forwarded to ``head_factory`` and stored on the model.
            head_type: forwarded to ``head_factory`` and stored on the model.
            depth_mode: stored on the model as ``self.depth_mode``.
            conf_mode: stored on the model; its truthiness decides ``has_conf``.
            freeze: one of ``'none'``, ``'mask'``, ``'encoder'`` (see ``set_freeze``).
            landscape_only: passed as ``activate`` to ``transpose_to_landscape``.
            patch_embed_cls: name handed to ``get_patch_embed``.
            **croco_kwargs: forwarded to ``CroCoNet.__init__``.
        """
        # Stored before the parent constructor runs because _set_patch_embed reads it
        # (presumably invoked from CroCoNet.__init__ -- TODO confirm).
        self.patch_embed_cls = patch_embed_cls

        parent_init = super().__init__
        # NOTE(review): fill_default_args presumably completes croco_kwargs in place with the
        # parent's defaults; set_downstream_head below needs patch_size and img_size from it.
        self.croco_args = fill_default_args(croco_kwargs, parent_init)
        parent_init(**croco_kwargs)

        # dust3r specific initialization: the second decoder starts as a copy of the first
        self.dec_blocks2 = deepcopy(self.dec_blocks)
        self.set_downstream_head(output_mode, head_type, landscape_only, depth_mode, conf_mode, **croco_kwargs)
        self.set_freeze(freeze)

    def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
        """Create ``self.patch_embed`` from the class name chosen at construction."""
        embed = get_patch_embed(self.patch_embed_cls, img_size, patch_size, enc_embed_dim)
        self.patch_embed = embed

    def load_state_dict(self, ckpt, **kw):
        """Load ``ckpt``, cloning decoder weights for the second decoder if absent.

        When no key of ``ckpt`` starts with ``dec_blocks2``, every entry whose key
        starts with ``dec_blocks`` is also registered under the matching
        ``dec_blocks2`` key.  ``ckpt`` itself is not modified.
        """
        state = dict(ckpt)
        has_second_decoder = any(name.startswith('dec_blocks2') for name in ckpt)
        if not has_second_decoder:
            state.update({
                name.replace('dec_blocks', 'dec_blocks2'): weights
                for name, weights in ckpt.items()
                if name.startswith('dec_blocks')
            })
        return super().load_state_dict(state, **kw)

    def set_freeze(self, freeze):  # this is for use by downstream models
        """Record ``freeze`` and freeze the matching group of parameters.

        ``freeze`` must be ``'none'``, ``'mask'`` or ``'encoder'``; any other
        value raises ``KeyError``.
        """
        self.freeze = freeze
        mask_only = [self.mask_token]
        encoder_side = [self.mask_token, self.patch_embed, self.enc_blocks]
        presets = {
            'none': [],
            'mask': mask_only,
            'encoder': encoder_side,
        }
        freeze_all_params(presets[freeze])

    def _set_prediction_head(self, *args, **kwargs):
        """ No prediction head """
        return None

    def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size,
                            **kw):
        """Store the head configuration and allocate one head per view.

        ``img_size`` must be a multiple of ``patch_size`` along both dimensions.
        Extra keyword arguments are accepted and ignored.
        """
        assert img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0, \
            f'{img_size=} must be multiple of {patch_size=}'

        self.output_mode = output_mode
        self.head_type = head_type
        self.depth_mode = depth_mode
        self.conf_mode = conf_mode

        # allocate heads (one per view); a head predicts confidence iff conf_mode is truthy
        with_conf = bool(conf_mode)
        self.downstream_head1 = head_factory(head_type, output_mode, self, has_conf=with_conf)
        self.downstream_head2 = head_factory(head_type, output_mode, self, has_conf=with_conf)

        # magic wrapper
        self.head1 = transpose_to_landscape(self.downstream_head1, activate=landscape_only)
        self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only)

    def _encode_image(self, image, true_shape):
        """Run patch embedding and the encoder on ``image``.

        ``image`` may be a single batch or both views concatenated along the
        batch dimension (see ``_encode_image_pairs``).

        Returns:
            ``(tokens, pos, None)`` -- the normalized encoder output, the
            positions returned by the patch embedding, and a ``None`` placeholder.
        """
        # embed the image into patches (tokens has size B x Npatches x C);
        # the patch embedding also returns the patch positions
        tokens, pos = self.patch_embed(image, true_shape=true_shape)

        # add positional embedding without cls token
        assert self.enc_pos_embed is None

        # now apply the transformer encoder and normalization
        for block in self.enc_blocks:
            tokens = block(tokens, pos)

        return self.enc_norm(tokens), pos, None

    def _encode_image_pairs(self, img1, img2, true_shape1, true_shape2):
        """Encode two image batches, in a single pass when their H x W match.

        Returns:
            ``(out1, out2, pos1, pos2)``.
        """
        if img1.shape[-2:] != img2.shape[-2:]:
            # different spatial sizes: each view goes through the encoder on its own
            out, pos, _ = self._encode_image(img1, true_shape1)
            out2, pos2, _ = self._encode_image(img2, true_shape2)
            return out, out2, pos, pos2

        # same spatial size: concatenate both views along the batch dimension,
        # encode once, then split the result back in two halves
        stacked_imgs = torch.cat((img1, img2), dim=0)
        stacked_shapes = torch.cat((true_shape1, true_shape2), dim=0)
        out, pos, _ = self._encode_image(stacked_imgs, stacked_shapes)
        out, out2 = out.chunk(2, dim=0)
        pos, pos2 = pos.chunk(2, dim=0)
        return out, out2, pos, pos2

    def _encode_symmetrized(self, view1, view2):
        """Encode both views, skipping redundant work for symmetrized batches.

        Returns:
            ``((shape1, shape2), (feat1, feat2), (pos1, pos2))``.
        """
        img1 = view1['img']
        img2 = view2['img']
        batch = img1.shape[0]

        # Recover true_shape when available, otherwise assume that the img shape is the true one
        fallback1 = torch.tensor(img1.shape[-2:])[None].repeat(batch, 1)
        shape1 = view1.get('true_shape', fallback1)
        fallback2 = torch.tensor(img2.shape[-2:])[None].repeat(batch, 1)
        shape2 = view2.get('true_shape', fallback2)
        # warning! maybe the images have different portrait/landscape orientations

        if is_symmetrized(view1, view2):
            # computing half of forward pass: only every other sample is encoded,
            # then interleave() is used to rebuild both outputs
            # (presumably restoring the full batch -- TODO confirm against interleave)
            feat1, feat2, pos1, pos2 = self._encode_image_pairs(img1[::2], img2[::2], shape1[::2], shape2[::2])
            feat1, feat2 = interleave(feat1, feat2)
            pos1, pos2 = interleave(pos1, pos2)
        else:
            feat1, feat2, pos1, pos2 = self._encode_image_pairs(img1, img2, shape1, shape2)

        return (shape1, shape2), (feat1, feat2), (pos1, pos2)

    def _decoder(self, f1, pos1, f2, pos2):
        """Run the two decoder stacks and collect every intermediate output.

        Returns:
            A ``zip`` yielding two tuples (view1 then view2).  Each tuple holds the
            encoder features, then the output of every decoder block; the last
            entry has been passed through ``dec_norm``.
        """
        stages = [(f1, f2)]  # the two encodings coming from the encoder, before projection

        # project to decoder dim
        f1, f2 = self.decoder_embed(f1), self.decoder_embed(f2)
        stages.append((f1, f2))  # the two encodings after projection

        # dec_blocks2 starts as a deep copy of dec_blocks (see __init__)
        for blk1, blk2 in zip(self.dec_blocks, self.dec_blocks2):
            prev1, prev2 = stages[-1]
            # img1 side: attends with (f1, f2) from the previous stage
            f1, _ = blk1(prev1, prev2, pos1, pos2)
            # img2 side: same inputs with the roles swapped, i.e. (f2, f1)
            f2, _ = blk2(prev2, prev1, pos2, pos1)
            # store the result
            stages.append((f1, f2))

        # drop the projected encodings (duplicate with stages[0])
        stages.pop(1)
        # normalize last output
        stages[-1] = tuple(self.dec_norm(tokens) for tokens in stages[-1])
        return zip(*stages)

    def _downstream_head(self, head_num, decout, img_shape):
        """Apply ``self.head<head_num>`` to the decoder outputs ``decout``."""
        # Values are unused; the unpacking fails unless the last decoder output has 3 dims.
        _batch, _seq, _dim = decout[-1].shape
        # img_shape = tuple(map(int, img_shape))
        selected_head = getattr(self, f'head{head_num}')
        return selected_head(decout, img_shape)

    def forward(self, view1, view2):
        """Predict the head outputs for both views.

        Returns:
            ``(res1, res2)``.  In ``res2`` the ``'pts3d'`` entry is renamed to
            ``'pts3d_in_other_view'``.
        """
        # encode the two images --> B,S,D
        shapes, feats, positions = self._encode_symmetrized(view1, view2)
        shape1, shape2 = shapes
        feat1, feat2 = feats
        pos1, pos2 = positions

        # combine all ref images into object-centric representation
        dec1, dec2 = self._decoder(feat1, pos1, feat2, pos2)

        # the decoder outputs are cast to float32 and fed to head1 / head2 with autocast disabled
        with torch.cuda.amp.autocast(enabled=False):
            res1 = self._downstream_head(1, [tokens.float() for tokens in dec1], shape1)
            res2 = self._downstream_head(2, [tokens.float() for tokens in dec2], shape2)

        # predict view2's pts3d in view1's frame
        res2['pts3d_in_other_view'] = res2.pop('pts3d')
        return res1, res2
|