import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.runner import force_fp32

from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
                        build_assigner, build_sampler, multi_apply,
                        reduce_mean)
from mmdet.models.utils import (FFN, build_positional_encoding,
                                build_transformer)
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead


@HEADS.register_module()
class TransformerHead(AnchorFreeHead):
"""Implements the DETR transformer head. |
|
|
|
See `paper: End-to-End Object Detection with Transformers |
|
<https://arxiv.org/pdf/2005.12872>`_ for details. |
|
|
|
Args: |
|
num_classes (int): Number of categories excluding the background. |
|
in_channels (int): Number of channels in the input feature map. |
|
num_fcs (int, optional): Number of fully-connected layers used in |
|
`FFN`, which is then used for the regression head. Default 2. |
|
transformer (dict, optional): Config for transformer. |
|
positional_encoding (dict, optional): Config for position encoding. |
|
loss_cls (dict, optional): Config of the classification loss. |
|
Default `CrossEntropyLoss`. |
|
loss_bbox (dict, optional): Config of the regression loss. |
|
Default `L1Loss`. |
|
loss_iou (dict, optional): Config of the regression iou loss. |
|
Default `GIoULoss`. |
|
tran_cfg (dict, optional): Training config of transformer head. |
|
test_cfg (dict, optional): Testing config of transformer head. |
|
|
|
Example: |
|
>>> import torch |
|
>>> self = TransformerHead(80, 2048) |
|
>>> x = torch.rand(1, 2048, 32, 32) |
|
>>> mask = torch.ones(1, 32, 32).to(x.dtype) |
|
>>> mask[:, :16, :15] = 0 |
|
>>> all_cls_scores, all_bbox_preds = self(x, mask) |
|
""" |
|

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_fcs=2,
                 transformer=dict(
                     type='Transformer',
                     embed_dims=256,
                     num_heads=8,
                     num_encoder_layers=6,
                     num_decoder_layers=6,
                     feedforward_channels=2048,
                     dropout=0.1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN'),
                     num_fcs=2,
                     pre_norm=False,
                     return_intermediate_dec=True),
                 positional_encoding=dict(
                     type='SinePositionalEncoding',
                     num_feats=128,
                     normalize=True),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     bg_cls_weight=0.1,
                     use_sigmoid=False,
                     loss_weight=1.0,
                     class_weight=1.0),
                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
                 train_cfg=dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         cls_cost=dict(type='ClassificationCost', weight=1.),
                         reg_cost=dict(type='BBoxL1Cost', weight=5.0),
                         iou_cost=dict(
                             type='IoUCost', iou_mode='giou', weight=2.0))),
                 test_cfg=dict(max_per_img=100),
                 **kwargs):
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since it brings inconvenience when the initialization of
        # `AnchorFreeHead` is called.
        super(AnchorFreeHead, self).__init__()
        use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
            'not supported in DETR, since background is needed for the ' \
            'matching process.'
        assert 'embed_dims' in transformer \
            and 'num_feats' in positional_encoding
        num_feats = positional_encoding['num_feats']
        embed_dims = transformer['embed_dims']
        # The sine positional encoding concatenates an x- and a y-embedding
        # of `num_feats` channels each, hence the factor of 2.
        assert num_feats * 2 == embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {embed_dims}' \
            f' and {num_feats}.'
        assert test_cfg is not None and 'max_per_img' in test_cfg

        class_weight = loss_cls.get('class_weight', None)
        if class_weight is not None:
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE following the official DETR repo, bg_cls_weight is the
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set the weight of the background class, which is the last index
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight
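            # For illustration (with the defaults num_classes=80,
            # class_weight=1.0 and bg_cls_weight=0.1): class_weight becomes a
            # length-81 tensor of ones whose last entry is 0.1, so the
            # no-object class contributes 10x less to the cross-entropy loss.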

        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided ' \
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
                'The classification weight for loss and matcher should be ' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['reg_cost'][
                'weight'], 'The regression L1 weight for loss and matcher ' \
                'should be exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
                'The regression iou weight for loss and matcher should be ' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR does not sample positives/negatives; a PseudoSampler is
            # used only to keep the assign-then-sample interface consistent.
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_classes = num_classes
        self.cls_out_channels = num_classes + 1
        self.in_channels = in_channels
        self.num_fcs = num_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.use_sigmoid_cls = use_sigmoid_cls
        self.embed_dims = embed_dims
        self.num_query = test_cfg['max_per_img']
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self._init_layers()

    def _init_layers(self):
        """Initialize layers of the transformer head."""
        self.input_proj = Conv2d(
            self.in_channels, self.embed_dims, kernel_size=1)
        self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        self.reg_ffn = FFN(
            self.embed_dims,
            self.embed_dims,
            self.num_fcs,
            self.act_cfg,
            dropout=0.0,
            add_residual=False)
        self.fc_reg = Linear(self.embed_dims, 4)
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)
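        # Shape sketch (assuming the defaults embed_dims=256, num_query=100
        # and, e.g., num_classes=80 with in_channels=2048): input_proj maps
        # [bs, 2048, h, w] features to [bs, 256, h, w]; fc_cls maps each
        # decoded query to 81 logits (80 classes + background); fc_reg maps
        # it to 4 normalized box parameters; query_embedding holds the 100
        # learnable object queries.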

    def init_weights(self, distribution='uniform'):
        """Initialize weights of the transformer head."""
        # Initialization of the transformer layers (attention and FFN
        # weights) is delegated to the transformer module itself.
        self.transformer.init_weights()

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        """load checkpoints."""
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since `AnchorFreeHead._load_from_state_dict` should not be
        # called here. Invoking the default `Module._load_from_state_dict`
        # is enough.
        super(AnchorFreeHead,
              self)._load_from_state_dict(state_dict, prefix, local_metadata,
                                          strict, missing_keys,
                                          unexpected_keys, error_msgs)

    def forward(self, feats, img_metas):
        """Forward function.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            img_metas (list[dict]): List of image information.

        Returns:
            tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.

                - all_cls_scores_list (list[Tensor]): Classification scores \
                    for each scale level. Each is a 4D-tensor with shape \
                    [nb_dec, bs, num_query, cls_out_channels]. Note \
                    `cls_out_channels` should include background.
                - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
                    outputs for each scale level. Each is a 4D-tensor with \
                    normalized coordinate format (cx, cy, w, h) and shape \
                    [nb_dec, bs, num_query, 4].
        """
        num_levels = len(feats)
        img_metas_list = [img_metas for _ in range(num_levels)]
        return multi_apply(self.forward_single, feats, img_metas_list)

def forward_single(self, x, img_metas): |
|
""""Forward function for a single feature level. |
|
|
|
Args: |
|
x (Tensor): Input feature from backbone's single stage, shape |
|
[bs, c, h, w]. |
|
img_metas (list[dict]): List of image information. |
|
|
|
Returns: |
|
all_cls_scores (Tensor): Outputs from the classification head, |
|
shape [nb_dec, bs, num_query, cls_out_channels]. Note |
|
cls_out_channels should includes background. |
|
all_bbox_preds (Tensor): Sigmoid outputs from the regression |
|
head with normalized coordinate format (cx, cy, w, h). |
|
Shape [nb_dec, bs, num_query, 4]. |
|
""" |
|
        # construct binary masks which are used by the transformer.
        # NOTE following the official DETR repo, non-zero values represent
        # ignored (padded) positions, while zero values mean valid positions.
        batch_size = x.size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        masks = x.new_ones((batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            masks[img_id, :img_h, :img_w] = 0
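        # For illustration: in a 2-image batch padded to 800x800 where image
        # 1 is only 600x800, masks[1] is 0 on its top-left 600x800 region
        # (valid pixels) and 1 on the bottom padding rows, which the
        # attention layers then ignore.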

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(
            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
        # position encoding of shape [bs, embed_dims, h, w]
        pos_embed = self.positional_encoding(masks)
        # outs_dec has shape [nb_dec, bs, num_query, embed_dims]
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        all_bbox_preds = self.fc_reg(self.activate(
            self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores_list,
             all_bbox_preds_list,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
""""Loss function. |
|
|
|
Only outputs from the last feature level are used for computing |
|
losses by default. |
|
|
|
Args: |
|
all_cls_scores_list (list[Tensor]): Classification outputs |
|
for each feature level. Each is a 4D-tensor with shape |
|
[nb_dec, bs, num_query, cls_out_channels]. |
|
all_bbox_preds_list (list[Tensor]): Sigmoid regression |
|
outputs for each feature level. Each is a 4D-tensor with |
|
normalized coordinate format (cx, cy, w, h) and shape |
|
[nb_dec, bs, num_query, 4]. |
|
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image |
|
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels_list (list[Tensor]): Ground truth class indices for each |
|
image with shape (num_gts, ). |
|
img_metas (list[dict]): List of image meta information. |
|
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes |
|
which can be ignored for each image. Default None. |
|
|
|
Returns: |
|
dict[str, Tensor]: A dictionary of loss components. |
|
""" |
|
        # NOTE by default only outputs from the last feature level are used.
        all_cls_scores = all_cls_scores_list[-1]
        all_bbox_preds = all_bbox_preds_list[-1]
        assert gt_bboxes_ignore is None, \
            'Only gt_bboxes_ignore=None is supported.'

        # replicate the targets for each decoder layer so that every layer
        # receives auxiliary supervision
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
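        # With the default 6 decoder layers and return_intermediate_dec=True,
        # loss_dict thus holds 'd0.*' through 'd4.*' keys for the five
        # intermediate layers plus the unprefixed keys for the last layer.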
        return loss_dict

    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
""""Loss function for outputs from a single decoder layer of a single |
|
feature level. |
|
|
|
Args: |
|
cls_scores (Tensor): Box score logits from a single decoder layer |
|
for all images. Shape [bs, num_query, cls_out_channels]. |
|
bbox_preds (Tensor): Sigmoid outputs from a single decoder layer |
|
for all images, with normalized coordinate (cx, cy, w, h) and |
|
shape [bs, num_query, 4]. |
|
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image |
|
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels_list (list[Tensor]): Ground truth class indices for each |
|
image with shape (num_gts, ). |
|
img_metas (list[dict]): List of image meta information. |
|
gt_bboxes_ignore_list (list[Tensor], optional): Bounding |
|
boxes which can be ignored for each image. Default None. |
|
|
|
Returns: |
|
dict[str, Tensor]: A dictionary of loss components for outputs from |
|
a single decoder layer. |
|
""" |
|
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
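        # Worked example for the avg_factor above (illustrative numbers):
        # with 100 queries per image of which 5 are matched to GT boxes, and
        # bg_cls_weight=0.1, cls_avg_factor = 5 * 1.0 + 95 * 0.1 = 14.5.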

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used to rescale bboxes to the image scale
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regresses the relative position of boxes (cxcywh) in the
        # image, so the learning target is normalized by the image size.
        # Here both predictions and targets are re-scaled for calculating
        # the IoU loss.
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, by default GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_targets(self,
                    cls_scores_list,
                    bbox_preds_list,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
""""Compute regression and classification targets for a batch image. |
|
|
|
Outputs from a single decoder layer of a single feature level are used. |
|
|
|
Args: |
|
cls_scores_list (list[Tensor]): Box score logits from a single |
|
decoder layer for each image with shape [num_query, |
|
cls_out_channels]. |
|
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single |
|
decoder layer for each image, with normalized coordinate |
|
(cx, cy, w, h) and shape [num_query, 4]. |
|
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image |
|
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels_list (list[Tensor]): Ground truth class indices for each |
|
image with shape (num_gts, ). |
|
img_metas (list[dict]): List of image meta information. |
|
gt_bboxes_ignore_list (list[Tensor], optional): Bounding |
|
boxes which can be ignored for each image. Default None. |
|
|
|
Returns: |
|
tuple: a tuple containing the following targets. |
|
|
|
- labels_list (list[Tensor]): Labels for all images. |
|
- label_weights_list (list[Tensor]): Label weights for all \ |
|
images. |
|
- bbox_targets_list (list[Tensor]): BBox targets for all \ |
|
images. |
|
- bbox_weights_list (list[Tensor]): BBox weights for all \ |
|
images. |
|
- num_total_pos (int): Number of positive samples in all \ |
|
images. |
|
- num_total_neg (int): Number of negative samples in all \ |
|
images. |
|
""" |
|
        assert gt_bboxes_ignore_list is None, \
            'Only gt_bboxes_ignore_list=None is supported.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
             self._get_target_single, cls_scores_list, bbox_preds_list,
             gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
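        # NOTE with one-to-one Hungarian matching, every GT box is assigned
        # to exactly one query, so num_total_pos equals the number of GT
        # boxes in the batch and num_total_neg the number of unmatched
        # queries.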
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_target_single(self,
                           cls_score,
                           bbox_pred,
                           gt_bboxes,
                           gt_labels,
                           img_meta,
                           gt_bboxes_ignore=None):
""""Compute regression and classification targets for one image. |
|
|
|
Outputs from a single decoder layer of a single feature level are used. |
|
|
|
Args: |
|
cls_score (Tensor): Box score logits from a single decoder layer |
|
for one image. Shape [num_query, cls_out_channels]. |
|
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer |
|
for one image, with normalized coordinate (cx, cy, w, h) and |
|
shape [num_query, 4]. |
|
gt_bboxes (Tensor): Ground truth bboxes for one image with |
|
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. |
|
gt_labels (Tensor): Ground truth class indices for one image |
|
with shape (num_gts, ). |
|
img_meta (dict): Meta information for one image. |
|
gt_bboxes_ignore (Tensor, optional): Bounding boxes |
|
which can be ignored. Default None. |
|
|
|
Returns: |
|
tuple[Tensor]: a tuple containing the following for one image. |
|
|
|
- labels (Tensor): Labels of each image. |
|
- label_weights (Tensor]): Label weights of each image. |
|
- bbox_targets (Tensor): BBox targets of each image. |
|
- bbox_weights (Tensor): BBox weights of each image. |
|
- pos_inds (Tensor): Sampled positive indices for each image. |
|
- neg_inds (Tensor): Sampled negative indices for each image. |
|
""" |
|
        num_bboxes = bbox_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                             gt_labels, img_meta,
                                             gt_bboxes_ignore)
        sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                              gt_bboxes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets: unmatched queries are labeled as background, i.e.
        # class index `num_classes`
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(bbox_pred)
        bbox_weights = torch.zeros_like(bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regresses the relative position of boxes (cxcywh) in the
        # image. Thus the learning target is normalized by the image size,
        # and the box format is converted from the default x1y1x2y2 to
        # cxcywh.
        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                       img_h]).unsqueeze(0)
        pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
        bbox_targets[pos_inds] = pos_gt_bboxes_targets
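        # Worked example for the normalization above (illustrative numbers):
        # a matched GT box (x1, y1, x2, y2) = (100, 100, 300, 200) in an
        # 800x600 image is divided by factor (800, 600, 800, 600) and
        # converted to the cxcywh target (0.25, 0.25, 0.25, 1/6).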
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    # over-write because img_metas are needed as inputs for bbox_head.
    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
"""Forward function for training mode. |
|
|
|
Args: |
|
x (list[Tensor]): Features from backbone. |
|
img_metas (list[dict]): Meta information of each image, e.g., |
|
image size, scaling factor, etc. |
|
gt_bboxes (Tensor): Ground truth bboxes of the image, |
|
shape (num_gts, 4). |
|
gt_labels (Tensor): Ground truth labels of each box, |
|
shape (num_gts,). |
|
gt_bboxes_ignore (Tensor): Ground truth bboxes to be |
|
ignored, shape (num_ignored_gts, 4). |
|
proposal_cfg (mmcv.Config): Test / postprocessing configuration, |
|
if None, test_cfg would be used. |
|
|
|
Returns: |
|
dict[str, Tensor]: A dictionary of loss components. |
|
""" |
|
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        outs = self(x, img_metas)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, img_metas)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores_list,
                   all_bbox_preds_list,
                   img_metas,
                   rescale=False):
"""Transform network outputs for a batch into bbox predictions. |
|
|
|
Args: |
|
all_cls_scores_list (list[Tensor]): Classification outputs |
|
for each feature level. Each is a 4D-tensor with shape |
|
[nb_dec, bs, num_query, cls_out_channels]. |
|
all_bbox_preds_list (list[Tensor]): Sigmoid regression |
|
outputs for each feature level. Each is a 4D-tensor with |
|
normalized coordinate format (cx, cy, w, h) and shape |
|
[nb_dec, bs, num_query, 4]. |
|
img_metas (list[dict]): Meta information of each image. |
|
rescale (bool, optional): If True, return boxes in original |
|
image space. Default False. |
|
|
|
Returns: |
|
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \ |
|
The first item is an (n, 5) tensor, where the first 4 columns \ |
|
are bounding box positions (tl_x, tl_y, br_x, br_y) and the \ |
|
5-th column is a score between 0 and 1. The second item is a \ |
|
(n,) tensor where each item is the predicted class label of \ |
|
the corresponding box. |
|
""" |
|
        # NOTE by default only outputs from the last feature level and the
        # last decoder layer are used.
        cls_scores = all_cls_scores_list[-1][-1]
        bbox_preds = all_bbox_preds_list[-1][-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list

    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
"""Transform outputs from the last decoder layer into bbox predictions |
|
for each image. |
|
|
|
Args: |
|
cls_score (Tensor): Box score logits from the last decoder layer |
|
for each image. Shape [num_query, cls_out_channels]. |
|
bbox_pred (Tensor): Sigmoid outputs from the last decoder layer |
|
for each image, with coordinate format (cx, cy, w, h) and |
|
shape [num_query, 4]. |
|
img_shape (tuple[int]): Shape of input image, (height, width, 3). |
|
scale_factor (ndarray, optional): Scale factor of the image arange |
|
as (w_scale, h_scale, w_scale, h_scale). |
|
rescale (bool, optional): If True, return boxes in original image |
|
space. Default False. |
|
|
|
Returns: |
|
tuple[Tensor]: Results of detected bboxes and labels. |
|
|
|
- det_bboxes: Predicted bboxes with shape [num_query, 5], \ |
|
where the first 4 columns are bounding box positions \ |
|
(tl_x, tl_y, br_x, br_y) and the 5-th column are scores \ |
|
between 0 and 1. |
|
- det_labels: Predicted labels of the corresponding box with \ |
|
shape [num_query]. |
|
""" |
|
        assert len(cls_score) == len(bbox_pred)
        # exclude the background class before taking the per-query maximum
        scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            det_bboxes /= det_bboxes.new_tensor(scale_factor)
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
        return det_bboxes, det_labels
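

# Usage sketch (illustrative only; assumes the 'Transformer' and
# 'SinePositionalEncoding' modules are registered, as in a standard mmdet
# install):
#
#   head = TransformerHead(num_classes=80, in_channels=2048)
#   head.init_weights()
#   feats = (torch.rand(2, 2048, 32, 32), )
#   img_metas = [
#       dict(batch_input_shape=(1024, 1024), img_shape=(1024, 1024, 3),
#            scale_factor=(1.0, 1.0, 1.0, 1.0)),
#       dict(batch_input_shape=(1024, 1024), img_shape=(768, 1024, 3),
#            scale_factor=(1.0, 1.0, 1.0, 1.0)),
#   ]
#   cls_scores_list, bbox_preds_list = head(feats, img_metas)
#   results = head.get_bboxes(cls_scores_list, bbox_preds_list, img_metas)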