|
|
|
|
|
import functools
|
|
import io
|
|
import struct
|
|
import types
|
|
import torch
|
|
|
|
from detectron2.modeling import meta_arch
|
|
from detectron2.modeling.box_regression import Box2BoxTransform
|
|
from detectron2.modeling.roi_heads import keypoint_head
|
|
from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
|
|
|
|
from .c10 import Caffe2Compatible
|
|
from .caffe2_patch import ROIHeadsPatcher, patch_generalized_rcnn
|
|
from .shared import (
|
|
alias,
|
|
check_set_pb_arg,
|
|
get_pb_arg_floats,
|
|
get_pb_arg_valf,
|
|
get_pb_arg_vali,
|
|
get_pb_arg_vals,
|
|
mock_torch_nn_functional_interpolate,
|
|
)
|
|
|
|
|
|
def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
    """
    Pack the named output tensors of a caffe2 R-CNN model (i.e. Dict[str, Tensor])
    into detectron2's format (i.e. a list of Instances).
    This only works when the model follows the Caffe2 detectron's naming convention.

    Args:
        image_sizes (List[List[int, int]]): [H, W] of every image. Exactly one
            image is supported.
        tensor_outputs (Dict[str, Tensor]): external_output to its tensor. The keys
            "bbox_nms", "score_nms" and "class_nms" are required; "mask_fcn_probs",
            "keypoints_out", "kps_score" and "batch_splits" are optional.
        force_mask_on (Bool): if true, make sure there is a `pred_masks` field even
            when "mask_fcn_probs" is missing from tensor_outputs; an empty uint8
            tensor of shape [0, 1, 0, 0] is stored in that case.

    Returns:
        list[Instances]: one Instances object per entry of `image_sizes`.

    Raises:
        NotImplementedError: if "batch_splits" is present and evaluates to true.
    """
    instances_per_image = [Instances(size) for size in image_sizes]

    if tensor_outputs.get("batch_splits", None):
        raise NotImplementedError()
    assert len(image_sizes) == 1
    instances = instances_per_image[0]

    boxes = tensor_outputs["bbox_nms"]
    scores = tensor_outputs["score_nms"]
    classes = tensor_outputs["class_nms"]
    assert boxes is not None
    assert scores is not None
    assert classes is not None

    # Five columns per box selects RotatedBoxes, anything else plain Boxes.
    box_type = RotatedBoxes if boxes.shape[1] == 5 else Boxes
    instances.pred_boxes = box_type(boxes)
    instances.scores = scores
    instances.pred_classes = classes.to(torch.int64)

    mask_probs = tensor_outputs.get("mask_fcn_probs", None)
    if mask_probs is not None:
        # For every detection keep only the mask of its predicted class, then
        # re-insert a singleton dimension at position 1.
        labels = instances.pred_classes
        row_ids = torch.arange(mask_probs.shape[0], device=labels.device)
        instances.pred_masks = mask_probs[row_ids, labels][:, None]
    elif force_mask_on:
        # No mask output available: store an empty placeholder so the field exists.
        instances.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)

    keypoints_out = tensor_outputs.get("keypoints_out", None)
    kps_score = tensor_outputs.get("kps_score", None)
    if keypoints_out is not None:
        # Swap dims 1 and 2, then keep the first three entries of the last dim
        # (presumably x, y and a score per keypoint -- confirm against the exporter).
        instances.pred_keypoints = keypoints_out.transpose(1, 2)[:, :, [0, 1, 2]]
    elif kps_score is not None:
        # Only raw keypoint logits are available; keypoint_rcnn_inference is handed
        # the Instances object together with the logits.
        keypoint_head.keypoint_rcnn_inference(kps_score, [instances])

    return instances_per_image
|
|
|
|
|
|
def _cast_to_f32(f64):
|
|
return struct.unpack("f", struct.pack("f", f64))[0]
|
|
|
|
|
|
def set_caffe2_compatible_tensor_mode(model, enable=True):
    """
    Assign `enable` to `tensor_mode` on every Caffe2Compatible module visited
    by `model.apply`.

    Args:
        model: object exposing `apply(fn)`; here it is called with an nn.Module.
        enable (bool): the value written to `tensor_mode`.
    """

    def _set_mode(module):
        # Modules that are not Caffe2Compatible are left untouched.
        if not isinstance(module, Caffe2Compatible):
            return
        module.tensor_mode = enable

    model.apply(_set_mode)
|
|
|
|
|
|
def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
    """
    Convert detectron2-style batched inputs into the (data, im_info) tensor pair
    consumed by the caffe2-style forward. See Caffe2MetaArch.get_caffe2_inputs().

    Args:
        batched_inputs (list[dict]): each dict has an "image" entry that must be a
            3-dimensional tensor, and optionally a "height" entry.
        size_divisibility (int): forwarded to `ImageList.from_tensors`.
        device: device both returned tensors are moved to.

    Returns:
        tuple[Tensor, Tensor]: the batched image tensor, and an im_info tensor with
        one row `[height, width, scale]` per image, where height/width come from
        the ImageList's `image_sizes` and `scale = target_height / height`.
    """
    assert all(isinstance(x, dict) for x in batched_inputs)
    assert all(x["image"].dim() == 3 for x in batched_inputs)

    images = [x["image"] for x in batched_inputs]
    images = ImageList.from_tensors(images, size_divisibility)

    im_info = []
    for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
        # Only the height ratio is stored: a single scale number is kept per image,
        # so the requested "width" is intentionally not read here.
        target_height = input_per_image.get("height", image_size[0])
        scale = target_height / image_size[0]
        im_info.append([image_size[0], image_size[1], scale])
    im_info = torch.Tensor(im_info)

    return images.tensor.to(device), im_info.to(device)
|
|
|
|
|
|
class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module):
    """
    Common base for caffe2-compatible wrappers around a detectron2 meta architecture.

    Subclasses implement a traceable :meth:`forward`; its traced graph can be
    converted to a caffe2 graph through ONNX.
    """

    def __init__(self, cfg, torch_model, enable_tensor_mode=True):
        """
        Args:
            cfg (CfgNode): configuration; not read by this base class.
            torch_model (nn.Module): the detectron2 model (meta_arch) to be
                converted. It is stored as `self._wrapped_model`.
            enable_tensor_mode (bool): value passed to
                `set_caffe2_compatible_tensor_mode` for this module.
        """
        super().__init__()
        self._wrapped_model = torch_model
        # Switch to eval mode before configuring tensor mode.
        self.eval()
        set_caffe2_compatible_tensor_mode(self, enable=enable_tensor_mode)

    def get_caffe2_inputs(self, batched_inputs):
        """
        Turn pytorch-style structured inputs into caffe2-style inputs, i.e. a
        tuple of tensors.

        Args:
            batched_inputs (list[dict]): inputs to a detectron2 model
                in its standard format. Each dict has "image" (CHW tensor), and
                optionally "height" and "width".

        Returns:
            tuple[Tensor]:
                tuple of tensors that will be the inputs to the
                :meth:`forward` method. For existing models, the first
                is an NCHW tensor (padded and batched); the second is
                an im_info Nx3 tensor, where the rows are
                (height, width, unused legacy parameter)
        """
        wrapped = self._wrapped_model
        return convert_batched_inputs_to_c2_format(
            batched_inputs, wrapped.backbone.size_divisibility, wrapped.device
        )

    def encode_additional_info(self, predict_net, init_net):
        """
        Hook for saving extra metadata, used by inference, in the output protobuf.
        The base implementation stores nothing.
        """
        return None

    def forward(self, inputs):
        """
        Run the forward pass in caffe2 style. Implementations have to use
        caffe2-compatible ops, and the method is the one used for tracing.

        Args:
            inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_inputs`.
                They will be the inputs of the converted caffe2 graph.

        Returns:
            tuple[Tensor]: output tensors. They will be the outputs of the
                converted caffe2 graph.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def _caffe2_preprocess_image(self, inputs):
        """
        Caffe2 implementation of preprocess_image, which is called inside each
        MetaArch's forward. It normalizes the input images, and the final caffe2
        graph assumes the inputs have been batched already.

        Args:
            inputs: a `(data, im_info)` pair.

        Returns:
            ImageList: holds the normalized data as `tensor` and the aliased
            im_info tensor as `image_sizes`.
        """
        data, im_info = inputs
        data = alias(data, "data")
        im_info = alias(im_info, "im_info")

        wrapped = self._wrapped_model
        pixel_mean, pixel_std = wrapped.pixel_mean, wrapped.pixel_std
        normalized_data = alias((data - pixel_mean) / pixel_std, "normalized_data")

        # Note that the im_info tensor itself is passed as `image_sizes`.
        return ImageList(tensor=normalized_data, image_sizes=im_info)

    @staticmethod
    def get_outputs_converter(predict_net, init_net):
        """
        Create a function that converts outputs of the caffe2 model to
        detectron2's standard format.
        The function uses information in `predict_net` and `init_net` that is
        available at inference time. Therefore the function logic can be used
        in inference.

        The returned function has the following signature:

            def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs

        Where

            * batched_inputs (list[dict]): the original input format of the meta arch
            * c2_inputs (tuple[Tensor]): the caffe2 inputs.
            * c2_results (dict[str, Tensor]): the caffe2 output format,
                corresponding to the outputs of the :meth:`forward` function.
            * detectron2_outputs: the original output format of the meta arch.

        This function can be used to compare the outputs of the original meta
        arch and the converted caffe2 graph.

        Returns:
            callable: a callable of the above signature.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
|
|
|
|
|
|
class Caffe2GeneralizedRCNN(Caffe2MetaArch):
    """Caffe2-export wrapper for a `meta_arch.GeneralizedRCNN` model."""

    def __init__(self, cfg, torch_model, enable_tensor_mode=True):
        """
        Args:
            cfg (CfgNode): `cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT` is read
                when present; a missing attribute is treated as False.
            torch_model (meta_arch.GeneralizedRCNN): the model to wrap; it is
                passed through `patch_generalized_rcnn` first.
            enable_tensor_mode (bool): forwarded to the base class.
        """
        assert isinstance(torch_model, meta_arch.GeneralizedRCNN)
        patched_model = patch_generalized_rcnn(torch_model)
        super().__init__(cfg, patched_model, enable_tensor_mode)

        try:
            heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT
        except AttributeError:
            heatmap_max_keypoint = False

        patcher = ROIHeadsPatcher(self._wrapped_model.roi_heads, heatmap_max_keypoint)
        self.roi_heads_patcher = patcher
        # The ROI heads are only patched when running in tensor mode.
        if self.tensor_mode:
            patcher.patch_roi_heads()

    def encode_additional_info(self, predict_net, init_net):
        """
        Record size divisibility, device and meta-architecture name as
        arguments of `predict_net`.
        """
        wrapped = self._wrapped_model
        check_set_pb_arg(
            predict_net, "size_divisibility", "i", wrapped.backbone.size_divisibility
        )
        device_name = str(wrapped.device).encode("ascii")
        check_set_pb_arg(predict_net, "device", "s", device_name)
        check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN")

    @mock_torch_nn_functional_interpolate()
    def forward(self, inputs):
        """
        Outside tensor mode, defer to the wrapped model's `inference`.
        In tensor mode, run backbone, proposal generator and ROI heads on the
        caffe2-style inputs and return the flattened first detector result
        as a tuple.
        """
        if not self.tensor_mode:
            return self._wrapped_model.inference(inputs)

        wrapped = self._wrapped_model
        images = self._caffe2_preprocess_image(inputs)
        features = wrapped.backbone(images.tensor)
        proposals, _proposal_losses = wrapped.proposal_generator(images, features)
        detections, _detector_losses = wrapped.roi_heads(images, features, proposals)
        return tuple(detections[0].flatten())

    @staticmethod
    def get_outputs_converter(predict_net, init_net):
        """
        Return a `convert(batched_inputs, c2_inputs, c2_results)` callable that
        assembles the named caffe2 outputs into Instances and post-processes
        them with `GeneralizedRCNN._postprocess`.
        """

        def convert(batched_inputs, c2_inputs, c2_results):
            # The first two entries of each im_info row are used as the image size.
            _, im_info = c2_inputs
            image_sizes = [[int(row[0]), int(row[1])] for row in im_info]
            instances = assemble_rcnn_outputs_by_name(image_sizes, c2_results)
            return meta_arch.GeneralizedRCNN._postprocess(
                instances, batched_inputs, image_sizes
            )

        return convert
|
|
|
|
|
|
class Caffe2RetinaNet(Caffe2MetaArch):
    """Caffe2-export wrapper for a `meta_arch.RetinaNet` model."""

    def __init__(self, cfg, torch_model):
        """
        Args:
            cfg (CfgNode): forwarded to the base class.
            torch_model (meta_arch.RetinaNet): the model to wrap.
        """
        assert isinstance(torch_model, meta_arch.RetinaNet)
        super().__init__(cfg, torch_model)

    @mock_torch_nn_functional_interpolate()
    def forward(self, inputs):
        """
        Run backbone and head on the caffe2-style inputs.

        Returns:
            tuple[Tensor]: `images.image_sizes` first, then the aliased head input
            features ("feature_{i}"), then for every level the aliased
            "box_cls_{i}" and "box_delta_{i}" tensors.
        """
        assert self.tensor_mode
        images = self._caffe2_preprocess_image(inputs)

        # image_sizes is the aliased "im_info" tensor. It is not consumed below, so
        # it is returned explicitly (presumably so the export keeps it as a graph
        # input -- confirm against the exporter).
        return_tensors = [images.image_sizes]

        features = self._wrapped_model.backbone(images.tensor)
        features = [features[f] for f in self._wrapped_model.head_in_features]
        for i, feature_i in enumerate(features):
            features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True)
            return_tensors.append(features[i])

        pred_logits, pred_anchor_deltas = self._wrapped_model.head(features)
        for i, (box_cls_i, box_delta_i) in enumerate(zip(pred_logits, pred_anchor_deltas)):
            return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i)))
            return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i)))

        return tuple(return_tensors)

    def encode_additional_info(self, predict_net, init_net):
        """
        Record in `predict_net` everything `get_outputs_converter` reads back:
        size divisibility, device, meta-architecture name, the inference
        thresholds/limits, the box regression weights and the serialized anchor
        generator.
        """
        size_divisibility = self._wrapped_model.backbone.size_divisibility
        check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility)
        check_set_pb_arg(
            predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii")
        )
        check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet")

        # Inference parameters; float values are first narrowed to float32.
        check_set_pb_arg(
            predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.test_score_thresh)
        )
        check_set_pb_arg(
            predict_net, "topk_candidates", "i", self._wrapped_model.test_topk_candidates
        )
        check_set_pb_arg(
            predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.test_nms_thresh)
        )
        check_set_pb_arg(
            predict_net,
            "max_detections_per_image",
            "i",
            self._wrapped_model.max_detections_per_image,
        )

        check_set_pb_arg(
            predict_net,
            "bbox_reg_weights",
            "floats",
            [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights],
        )
        self._encode_anchor_generator_cfg(predict_net)

    def _encode_anchor_generator_cfg(self, predict_net):
        """Serialize the anchor generator with `torch.save` and store the bytes in `predict_net`."""
        buffer = io.BytesIO()
        torch.save(self._wrapped_model.anchor_generator, buffer)
        # Store the raw bytes as a string argument of the protobuf.
        serialized = buffer.getvalue()
        check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", serialized)

    @staticmethod
    def get_outputs_converter(predict_net, init_net):
        """
        Return a `convert(batched_inputs, c2_inputs, c2_results)` callable that
        runs RetinaNet's inference post-processing on the caffe2 outputs, using
        only the parameters stored in `predict_net`.
        """
        # A plain namespace stands in for a RetinaNet instance: it carries the
        # attributes and the bound inference methods set below.
        self = types.SimpleNamespace()
        serialized_anchor_generator = io.BytesIO(
            get_pb_arg_vals(predict_net, "serialized_anchor_generator", None)
        )
        # NOTE(security): torch.load unpickles an arbitrary object here. Only use
        # this converter with a predict_net from a trusted source.
        # NOTE(review): recent PyTorch releases change the `weights_only` default of
        # torch.load, which may reject a pickled module object -- confirm against
        # the supported torch version.
        self.anchor_generator = torch.load(serialized_anchor_generator)
        bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
        self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights))
        self.test_score_thresh = get_pb_arg_valf(predict_net, "score_threshold", None)
        self.test_topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
        self.test_nms_thresh = get_pb_arg_valf(predict_net, "nms_threshold", None)
        self.max_detections_per_image = get_pb_arg_vali(
            predict_net, "max_detections_per_image", None
        )

        # Bind the RetinaNet inference methods to the namespace so they can be
        # called as if it were a model instance.
        for meth in [
            "forward_inference",
            "inference_single_image",
            "_transpose_dense_predictions",
            "_decode_multi_level_predictions",
            "_decode_per_level_predictions",
        ]:
            setattr(self, meth, functools.partial(getattr(meta_arch.RetinaNet, meth), self))

        def f(batched_inputs, c2_inputs, c2_results):
            _, im_info = c2_inputs
            image_sizes = [[int(im[0]), int(im[1])] for im in im_info]
            # Random placeholder images, sized after the first image.
            dummy_images = ImageList(
                torch.randn(
                    (
                        len(im_info),
                        3,
                    )
                    + tuple(image_sizes[0])
                ),
                image_sizes,
            )

            # Count the levels by the number of "box_cls_*" outputs.
            num_features = sum(1 for name in c2_results if name.startswith("box_cls_"))
            pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)]
            pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)]

            # Zero-channel slices of the logits: they keep the batch and spatial
            # dimensions of each level but carry no data.
            dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits]
            # Delta channels // 4 is used as the divisor of the logit channels
            # (presumably anchors per location, giving classes per anchor -- confirm).
            self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4)

            results = self.forward_inference(
                dummy_images, dummy_features, [pred_logits, pred_anchor_deltas]
            )
            return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes)

        return f
|
|
|
|
|
|
# Lookup table from a meta-architecture name to the wrapper class in this module
# that implements its Caffe2 export.
META_ARCH_CAFFE2_EXPORT_TYPE_MAP = {
    "GeneralizedRCNN": Caffe2GeneralizedRCNN,
    "RetinaNet": Caffe2RetinaNet,
}
|
|
|