import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.runner import force_fp32

from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
                        build_assigner, build_sampler, multi_apply,
                        reduce_mean)
from mmdet.models.utils import (FFN, build_positional_encoding,
                                build_transformer)
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead


@HEADS.register_module()
class TransformerHead(AnchorFreeHead):
"""Implements the DETR transformer head. |
|
|
|
See `paper: End-to-End Object Detection with Transformers |
|
<https://arxiv.org/pdf/2005.12872>`_ for details. |
|
|
|
Args: |
|
num_classes (int): Number of categories excluding the background. |
|
in_channels (int): Number of channels in the input feature map. |
|
num_fcs (int, optional): Number of fully-connected layers used in |
|
`FFN`, which is then used for the regression head. Default 2. |
|
transformer (dict, optional): Config for transformer. |
|
positional_encoding (dict, optional): Config for position encoding. |
|
loss_cls (dict, optional): Config of the classification loss. |
|
Default `CrossEntropyLoss`. |
|
loss_bbox (dict, optional): Config of the regression loss. |
|
Default `L1Loss`. |
|
loss_iou (dict, optional): Config of the regression iou loss. |
|
Default `GIoULoss`. |
|
tran_cfg (dict, optional): Training config of transformer head. |
|
test_cfg (dict, optional): Testing config of transformer head. |
|
|
|
Example: |
|
>>> import torch |
|
>>> self = TransformerHead(80, 2048) |
|
>>> x = torch.rand(1, 2048, 32, 32) |
|
>>> mask = torch.ones(1, 32, 32).to(x.dtype) |
|
>>> mask[:, :16, :15] = 0 |
|
>>> all_cls_scores, all_bbox_preds = self(x, mask) |
|
""" |
|

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_fcs=2,
                 transformer=dict(
                     type='Transformer',
                     embed_dims=256,
                     num_heads=8,
                     num_encoder_layers=6,
                     num_decoder_layers=6,
                     feedforward_channels=2048,
                     dropout=0.1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN'),
                     num_fcs=2,
                     pre_norm=False,
                     return_intermediate_dec=True),
                 positional_encoding=dict(
                     type='SinePositionalEncoding',
                     num_feats=128,
                     normalize=True),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     bg_cls_weight=0.1,
                     use_sigmoid=False,
                     loss_weight=1.0,
                     class_weight=1.0),
                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
                 train_cfg=dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         cls_cost=dict(type='ClassificationCost', weight=1.),
                         reg_cost=dict(type='BBoxL1Cost', weight=5.0),
                         iou_cost=dict(
                             type='IoUCost', iou_mode='giou', weight=2.0))),
                 test_cfg=dict(max_per_img=100),
                 **kwargs):
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since it brings inconvenience when the initialization of
        # `AnchorFreeHead` is called.
        super(AnchorFreeHead, self).__init__()
        use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
            'not supported in DETR, since background is needed for the ' \
            'matching process.'
        assert 'embed_dims' in transformer \
            and 'num_feats' in positional_encoding
        num_feats = positional_encoding['num_feats']
        embed_dims = transformer['embed_dims']
        # The sine positional encoding concatenates an x- and a y-embedding
        # of `num_feats` channels each, hence the factor of 2.
        assert num_feats * 2 == embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {embed_dims}' \
            f' and {num_feats}.'
        assert test_cfg is not None and 'max_per_img' in test_cfg

        class_weight = loss_cls.get('class_weight', None)
        if class_weight is not None:
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE following the official DETR repo, bg_cls_weight is the
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set the weight of the background class, which is the last index
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight
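            # For illustration (with the defaults num_classes=80,
            # class_weight=1.0 and bg_cls_weight=0.1): class_weight becomes a
            # length-81 tensor of ones whose last entry is 0.1, so the
            # no-object class contributes 10x less to the cross-entropy loss.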

        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided ' \
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
                'The classification weight for loss and matcher should be ' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['reg_cost'][
                'weight'], 'The regression L1 weight for loss and matcher ' \
                'should be exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
                'The regression iou weight for loss and matcher should be ' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR does not sample positives/negatives; a PseudoSampler is
            # used only to keep the assign-then-sample interface consistent.
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_classes = num_classes
        self.cls_out_channels = num_classes + 1
        self.in_channels = in_channels
        self.num_fcs = num_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.use_sigmoid_cls = use_sigmoid_cls
        self.embed_dims = embed_dims
        self.num_query = test_cfg['max_per_img']
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self._init_layers()

    def _init_layers(self):
        """Initialize layers of the transformer head."""
        self.input_proj = Conv2d(
            self.in_channels, self.embed_dims, kernel_size=1)
        self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        self.reg_ffn = FFN(
            self.embed_dims,
            self.embed_dims,
            self.num_fcs,
            self.act_cfg,
            dropout=0.0,
            add_residual=False)
        self.fc_reg = Linear(self.embed_dims, 4)
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)
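        # Shape sketch (assuming the defaults embed_dims=256, num_query=100
        # and, e.g., num_classes=80 with in_channels=2048): input_proj maps
        # [bs, 2048, h, w] features to [bs, 256, h, w]; fc_cls maps each
        # decoded query to 81 logits (80 classes + background); fc_reg maps
        # it to 4 normalized box parameters; query_embedding holds the 100
        # learnable object queries.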

    def init_weights(self, distribution='uniform'):
        """Initialize weights of the transformer head."""
        # Initialization of the transformer layers (attention and FFN
        # weights) is delegated to the transformer module itself.
        self.transformer.init_weights()

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """load checkpoints."""
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since `AnchorFreeHead._load_from_state_dict` should not be
        # called here. Invoking the default `Module._load_from_state_dict`
        # is enough.
        super(AnchorFreeHead,
              self)._load_from_state_dict(state_dict, prefix, local_metadata,
                                          strict, missing_keys,
                                          unexpected_keys, error_msgs)

    def forward(self, feats, img_metas):
        """Forward function.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            img_metas (list[dict]): List of image information.

        Returns:
            tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.

                - all_cls_scores_list (list[Tensor]): Classification scores \
                    for each scale level. Each is a 4D-tensor with shape \
                    [nb_dec, bs, num_query, cls_out_channels]. Note \
                    `cls_out_channels` should include background.
                - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
                    outputs for each scale level. Each is a 4D-tensor with \
                    normalized coordinate format (cx, cy, w, h) and shape \
                    [nb_dec, bs, num_query, 4].
        """
        num_levels = len(feats)
        img_metas_list = [img_metas for _ in range(num_levels)]
        return multi_apply(self.forward_single, feats, img_metas_list)

def forward_single(self, x, img_metas): |
|
""""Forward function for a single feature level. |
|
|
|
Args: |
|
x (Tensor): Input feature from backbone's single stage, shape |
|
[bs, c, h, w]. |
|
img_metas (list[dict]): List of image information. |
|
|
|
Returns: |
|
all_cls_scores (Tensor): Outputs from the classification head, |
|
shape [nb_dec, bs, num_query, cls_out_channels]. Note |
|
cls_out_channels should includes background. |
|
all_bbox_preds (Tensor): Sigmoid outputs from the regression |
|
head with normalized coordinate format (cx, cy, w, h). |
|
Shape [nb_dec, bs, num_query, 4]. |
|
""" |
|
        # construct binary masks which are used by the transformer.
        # NOTE following the official DETR repo, non-zero values represent
        # ignored (padded) positions, while zero values mean valid positions.
        batch_size = x.size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        masks = x.new_ones((batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            masks[img_id, :img_h, :img_w] = 0
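        # For illustration: in a 2-image batch padded to 800x800 where image
        # 1 is only 600x800, masks[1] is 0 on its top-left 600x800 region
        # (valid pixels) and 1 on the bottom padding rows, which the
        # attention layers then ignore.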

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(
            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
        # position encoding of shape [bs, embed_dims, h, w]
        pos_embed = self.positional_encoding(masks)
        # outs_dec has shape [nb_dec, bs, num_query, embed_dims]
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        all_bbox_preds = self.fc_reg(self.activate(
            self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores_list,
             all_bbox_preds_list,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
""""Loss function. |
|
|
|
Only outputs from the last feature level are used for computing |
|
losses by default. |
|
|
|
Args: |
|
all_cls_scores_list (list[Tensor]): Classification outputs |
|
for each feature level. Each is a 4D-tensor with shape |
|
[nb_dec, bs, num_query, cls_out_channels]. |
|
all_bbox_preds_list (list[Tensor]): Sigmoid regression |
|
outputs for each feature level. Each is a 4D-tensor with |
|
normalized coordinate format (cx, cy, w, h) and shape |
|
[nb_dec, bs, num_query, 4]. |
|
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image |
|
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels_list (list[Tensor]): Ground truth class indices for each |
|
image with shape (num_gts, ). |
|
img_metas (list[dict]): List of image meta information. |
|
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes |
|
which can be ignored for each image. Default None. |
|
|
|
Returns: |
|
dict[str, Tensor]: A dictionary of loss components. |
|
""" |
|
        # NOTE by default only outputs from the last feature level are used.
        all_cls_scores = all_cls_scores_list[-1]
        all_bbox_preds = all_bbox_preds_list[-1]
        assert gt_bboxes_ignore is None, \
            'Only gt_bboxes_ignore=None is supported.'

        # replicate the targets for each decoder layer so that every layer
        # receives auxiliary supervision
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
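        # With the default 6 decoder layers and return_intermediate_dec=True,
        # loss_dict thus holds 'd0.*' through 'd4.*' keys for the five
        # intermediate layers plus the unprefixed keys for the last layer.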
        return loss_dict

    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
""""Loss function for outputs from a single decoder layer of a single |
|
feature level. |
|
|
|
Args: |
|
cls_scores (Tensor): Box score logits from a single decoder layer |
|
for all images. Shape [bs, num_query, cls_out_channels]. |
|
bbox_preds (Tensor): Sigmoid outputs from a single decoder layer |
|
for all images, with normalized coordinate (cx, cy, w, h) and |
|
shape [bs, num_query, 4]. |
|
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image |
|
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels_list (list[Tensor]): Ground truth class indices for each |
|
image with shape (num_gts, ). |
|
img_metas (list[dict]): List of image meta information. |
|
gt_bboxes_ignore_list (list[Tensor], optional): Bounding |
|
boxes which can be ignored for each image. Default None. |
|
|
|
Returns: |
|
dict[str, Tensor]: A dictionary of loss components for outputs from |
|
a single decoder layer. |
|
""" |
|
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
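        # Worked example for the avg_factor above (illustrative numbers):
        # with 100 queries per image of which 5 are matched to GT boxes, and
        # bg_cls_weight=0.1, cls_avg_factor = 5 * 1.0 + 95 * 0.1 = 14.5.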

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used to rescale bboxes to the image scale
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regresses the relative position of boxes (cxcywh) in the
        # image, so the learning target is normalized by the image size.
        # Here both predictions and targets are re-scaled for calculating
        # the IoU loss.
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, by default GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_targets(self,
                    cls_scores_list,
                    bbox_preds_list,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
""""Compute regression and classification targets for a batch image. |
|
|
|
Outputs from a single decoder layer of a single feature level are used. |
|
|
|
Args: |
|
cls_scores_list (list[Tensor]): Box score logits from a single |
|
decoder layer for each image with shape [num_query, |
|
cls_out_channels]. |
|
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single |
|
decoder layer for each image, with normalized coordinate |
|
(cx, cy, w, h) and shape [num_query, 4]. |
|
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image |
|
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels_list (list[Tensor]): Ground truth class indices for each |
|
image with shape (num_gts, ). |
|
img_metas (list[dict]): List of image meta information. |
|
gt_bboxes_ignore_list (list[Tensor], optional): Bounding |
|
boxes which can be ignored for each image. Default None. |
|
|
|
Returns: |
|
tuple: a tuple containing the following targets. |
|
|
|
- labels_list (list[Tensor]): Labels for all images. |
|
- label_weights_list (list[Tensor]): Label weights for all \ |
|
images. |
|
- bbox_targets_list (list[Tensor]): BBox targets for all \ |
|
images. |
|
- bbox_weights_list (list[Tensor]): BBox weights for all \ |
|
images. |
|
- num_total_pos (int): Number of positive samples in all \ |
|
images. |
|
- num_total_neg (int): Number of negative samples in all \ |
|
images. |
|
""" |
|
        assert gt_bboxes_ignore_list is None, \
            'Only gt_bboxes_ignore_list=None is supported.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
             self._get_target_single, cls_scores_list, bbox_preds_list,
             gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
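        # NOTE with one-to-one Hungarian matching, every GT box is assigned
        # to exactly one query, so num_total_pos equals the number of GT
        # boxes in the batch and num_total_neg the number of unmatched
        # queries.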
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_target_single(self,
                           cls_score,
                           bbox_pred,
                           gt_bboxes,
                           gt_labels,
                           img_meta,
                           gt_bboxes_ignore=None):
""""Compute regression and classification targets for one image. |
|
|
|
Outputs from a single decoder layer of a single feature level are used. |
|
|
|
Args: |
|
cls_score (Tensor): Box score logits from a single decoder layer |
|
for one image. Shape [num_query, cls_out_channels]. |
|
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer |
|
for one image, with normalized coordinate (cx, cy, w, h) and |
|
shape [num_query, 4]. |
|
gt_bboxes (Tensor): Ground truth bboxes for one image with |
|
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels (Tensor): Ground truth class indices for one image |
|
with shape (num_gts, ). |
|
img_meta (dict): Meta information for one image. |
|
gt_bboxes_ignore (Tensor, optional): Bounding boxes |
|
which can be ignored. Default None. |
|
|
|
Returns: |
|
tuple[Tensor]: a tuple containing the following for one image. |
|
|
|
- labels (Tensor): Labels of each image. |
|
- label_weights (Tensor]): Label weights of each image. |
|
- bbox_targets (Tensor): BBox targets of each image. |
|
- bbox_weights (Tensor): BBox weights of each image. |
|
- pos_inds (Tensor): Sampled positive indices for each image. |
|
- neg_inds (Tensor): Sampled negative indices for each image. |
|
""" |
|
        num_bboxes = bbox_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                             gt_labels, img_meta,
                                             gt_bboxes_ignore)
        sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                              gt_bboxes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets: unmatched queries are labeled as background, i.e.
        # class index `num_classes`
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(bbox_pred)
        bbox_weights = torch.zeros_like(bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regresses the relative position of boxes (cxcywh) in the
        # image. Thus the learning target is normalized by the image size,
        # and the box format is converted from the default x1y1x2y2 to
        # cxcywh.
        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                       img_h]).unsqueeze(0)
        pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
        bbox_targets[pos_inds] = pos_gt_bboxes_targets
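        # Worked example for the normalization above (illustrative numbers):
        # a matched GT box (x1, y1, x2, y2) = (100, 100, 300, 200) in an
        # 800x600 image is divided by factor (800, 600, 800, 600) and
        # converted to the cxcywh target (0.25, 0.25, 0.25, 1/6).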
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    # over-write because img_metas are needed as inputs for bbox_head.
    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
"""Forward function for training mode. |
|
|
|
Args: |
|
x (list[Tensor]): Features from backbone. |
|
img_metas (list[dict]): Meta information of each image, e.g., |
|
image size, scaling factor, etc. |
|
gt_bboxes (Tensor): Ground truth bboxes of the image, |
|
shape (num_gts, 4). |
|
gt_labels (Tensor): Ground truth labels of each box, |
|
shape (num_gts,). |
|
gt_bboxes_ignore (Tensor): Ground truth bboxes to be |
|
ignored, shape (num_ignored_gts, 4). |
|
proposal_cfg (mmcv.Config): Test / postprocessing configuration, |
|
if None, test_cfg would be used. |
|
|
|
Returns: |
|
dict[str, Tensor]: A dictionary of loss components. |
|
""" |
|
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        outs = self(x, img_metas)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, img_metas)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores_list,
                   all_bbox_preds_list,
                   img_metas,
                   rescale=False):
"""Transform network outputs for a batch into bbox predictions. |
|
|
|
Args: |
|
all_cls_scores_list (list[Tensor]): Classification outputs |
|
for each feature level. Each is a 4D-tensor with shape |
|
[nb_dec, bs, num_query, cls_out_channels]. |
|
all_bbox_preds_list (list[Tensor]): Sigmoid regression |
|
outputs for each feature level. Each is a 4D-tensor with |
|
normalized coordinate format (cx, cy, w, h) and shape |
|
[nb_dec, bs, num_query, 4]. |
|
img_metas (list[dict]): Meta information of each image. |
|
rescale (bool, optional): If True, return boxes in original |
|
image space. Default False. |
|
|
|
Returns: |
|
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \ |
|
The first item is an (n, 5) tensor, where the first 4 columns \ |
|
are bounding box positions (tl_x, tl_y, br_x, br_y) and the \ |
|
5-th column is a score between 0 and 1. The second item is a \ |
|
(n,) tensor where each item is the predicted class label of \ |
|
the corresponding box. |
|
""" |
|
        # NOTE by default only outputs from the last feature level and the
        # last decoder layer are used.
        cls_scores = all_cls_scores_list[-1][-1]
        bbox_preds = all_bbox_preds_list[-1][-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list

    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
"""Transform outputs from the last decoder layer into bbox predictions |
|
for each image. |
|
|
|
Args: |
|
cls_score (Tensor): Box score logits from the last decoder layer |
|
for each image. Shape [num_query, cls_out_channels]. |
|
bbox_pred (Tensor): Sigmoid outputs from the last decoder layer |
|
for each image, with coordinate format (cx, cy, w, h) and |
|
shape [num_query, 4]. |
|
img_shape (tuple[int]): Shape of input image, (height, width, 3). |
|
scale_factor (ndarray, optional): Scale factor of the image arange |
|
as (w_scale, h_scale, w_scale, h_scale). |
|
rescale (bool, optional): If True, return boxes in original image |
|
space. Default False. |
|
|
|
Returns: |
|
tuple[Tensor]: Results of detected bboxes and labels. |
|
|
|
- det_bboxes: Predicted bboxes with shape [num_query, 5], \ |
|
where the first 4 columns are bounding box positions \ |
|
(tl_x, tl_y, br_x, br_y) and the 5-th column are scores \ |
|
between 0 and 1. |
|
- det_labels: Predicted labels of the corresponding box with \ |
|
shape [num_query]. |
|
""" |
|
        assert len(cls_score) == len(bbox_pred)
        # exclude the background class before taking the per-query maximum
        scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            det_bboxes /= det_bboxes.new_tensor(scale_factor)
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
        return det_bboxes, det_labels
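

# Usage sketch (illustrative only; assumes the 'Transformer' and
# 'SinePositionalEncoding' modules are registered, as in a standard mmdet
# install):
#
#   head = TransformerHead(num_classes=80, in_channels=2048)
#   head.init_weights()
#   feats = (torch.rand(2, 2048, 32, 32), )
#   img_metas = [
#       dict(batch_input_shape=(1024, 1024), img_shape=(1024, 1024, 3),
#            scale_factor=(1.0, 1.0, 1.0, 1.0)),
#       dict(batch_input_shape=(1024, 1024), img_shape=(768, 1024, 3),
#            scale_factor=(1.0, 1.0, 1.0, 1.0)),
#   ]
#   cls_scores_list, bbox_preds_list = head(feats, img_metas)
#   results = head.get_bboxes(cls_scores_list, bbox_preds_list, img_metas)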