Source code for mmdet.models.dense_heads.cascade_rpn_head
# Copyright (c) OpenMMLab. All rights reserved.
from __future__ import division
import copy
from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
try:
    from mmcv.ops import DeformConv2d
except ImportError as _deform_conv_import_error:
    # Names bound by ``except ... as`` are cleared when the handler exits, so
    # keep a module-level reference that the stub below can chain from.
    _DEFORM_CONV_IMPORT_ERROR = _deform_conv_import_error

    class DeformConv2d:
        """Placeholder used when ``mmcv.ops`` cannot be imported.

        Importing this module still succeeds without compiled mmcv ops; the
        failure is deferred until a deformable conv is actually constructed.

        Raises:
            RuntimeError: Always, on instantiation. The original
                ``ImportError`` is attached as ``__cause__`` so that the real
                import failure stays visible in the traceback.
        """

        def __init__(self, *args, **kwargs):
            raise RuntimeError(
                'DeformConv2d requires mmcv to be compiled with ops. Please '
                'reinstall onedl-mmcv with CUDA support.'
            ) from _DEFORM_CONV_IMPORT_ERROR
from mmengine.config import ConfigDict
from mmengine.model import BaseModule, ModuleList
from mmengine.structures import InstanceData
from torch import Tensor
from mmdet.registry import MODELS, TASK_UTILS
from mmdet.structures import SampleList
from mmdet.utils import (ConfigType, InstanceList, MultiConfig,
OptInstanceList, OptMultiConfig)
from ..task_modules.assigners import RegionAssigner
from ..task_modules.samplers import PseudoSampler
from ..utils import (images_to_levels, multi_apply, select_single_mlvl,
unpack_gt_instances)
from .base_dense_head import BaseDenseHead
from .rpn_head import RPNHead
class AdaptiveConv(BaseModule):
    """AdaptiveConv used to adapt the sampling location with the anchors.

    Two modes are supported:

    - ``'offset'``: a ``DeformConv2d`` whose sampling offsets are supplied
      per call (arbitrary anchors).
    - ``'dilation'``: a plain ``nn.Conv2d`` with a fixed dilation
      (uniform anchor).

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int or tuple[int]): Size of the conv kernel.
            Only 3 is supported. Defaults to 3.
        stride (int or tuple[int]): Stride of the convolution. Must be 1 in
            ``'offset'`` mode; not used in ``'dilation'`` mode.
            Defaults to 1.
        padding (int or tuple[int]): Zero-padding added to both sides of
            the input. Must be 1 in ``'offset'`` mode; in ``'dilation'``
            mode the padding is set to ``dilation`` instead. Defaults to 1.
        dilation (int or tuple[int]): Spacing between kernel elements.
            Only used in ``'dilation'`` mode. Defaults to 3.
        groups (int): Number of blocked connections from input channels to
            output channels. Must be 1 in ``'offset'`` mode; not used in
            ``'dilation'`` mode. Defaults to 1.
        bias (bool): If set True, adds a learnable bias to the output.
            Only forwarded to the conv in ``'offset'`` mode; in
            ``'dilation'`` mode ``nn.Conv2d`` is built with its own default.
            Defaults to False.
        adapt_type (str): Type of adaptive conv, can be either ``'offset'``
            (arbitrary anchors) or ``'dilation'`` (uniform anchor).
            Defaults to 'dilation'.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict]): Initialization config dict.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int]] = 3,
        stride: Union[int, Tuple[int]] = 1,
        padding: Union[int, Tuple[int]] = 1,
        dilation: Union[int, Tuple[int]] = 3,
        groups: int = 1,
        bias: bool = False,
        adapt_type: str = 'dilation',
        init_cfg: MultiConfig = dict(
            type='Normal', std=0.01, override=dict(name='conv'))
    ) -> None:
        super().__init__(init_cfg=init_cfg)
        assert adapt_type in ['offset', 'dilation']
        self.adapt_type = adapt_type

        assert kernel_size == 3, 'Adaptive conv only supports kernels 3'
        if self.adapt_type == 'offset':
            # The message used to mix a plain string with an f-string, which
            # rendered as "padding: {1}, stride: 1, groups: 1".
            assert stride == 1 and padding == 1 and groups == 1, \
                'Adaptive conv offset mode only supports padding: 1, ' \
                'stride: 1, groups: 1'
            self.conv = DeformConv2d(
                in_channels,
                out_channels,
                kernel_size,
                padding=padding,
                stride=stride,
                groups=groups,
                bias=bias)
        else:
            # padding == dilation keeps the spatial size for a 3x3 kernel.
            # NOTE: stride / padding / groups / bias are intentionally not
            # forwarded here; changing that would alter existing models.
            self.conv = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                padding=dilation,
                dilation=dilation)

    def forward(self, x: Tensor, offset: Optional[Tensor]) -> Tensor:
        """Forward function.

        Args:
            x (Tensor): Input feature map of shape (N, C, H, W).
            offset (Tensor, optional): Sampling offsets. In ``'offset'`` mode
                it must not be None and its second dimension must equal
                ``H * W`` (layout [N, NA, 18]). In ``'dilation'`` mode it
                must be None.

        Returns:
            Tensor: Output of the underlying convolution.
        """
        if self.adapt_type == 'offset':
            N, _, H, W = x.shape
            assert offset is not None
            assert H * W == offset.shape[1]
            # reshape [N, NA, 18] to (N, 18, H, W)
            offset = offset.permute(0, 2, 1).reshape(N, -1, H, W)
            offset = offset.contiguous()
            x = self.conv(x, offset)
        else:
            assert offset is None
            x = self.conv(x)
        return x
[docs]
@MODELS.register_module()
class StageCascadeRPNHead(RPNHead):
    """Stage of CascadeRPNHead.

    Args:
        in_channels (int): Number of channels in the input feature map.
        anchor_generator (:obj:`ConfigDict` or dict): anchor generator config.
        adapt_cfg (:obj:`ConfigDict` or dict): adaptation config. Its
            ``type`` key selects the ``AdaptiveConv`` mode and the remaining
            keys are forwarded to ``AdaptiveConv``.
        bridged_feature (bool): whether update rpn feature. Defaults to False.
        with_cls (bool): whether use classification branch. Defaults to True.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
        **kwargs: Forwarded to ``RPNHead.__init__``.
    """

    def __init__(self,
                 in_channels: int,
                 anchor_generator: ConfigType = dict(
                     type='AnchorGenerator',
                     scales=[8],
                     ratios=[1.0],
                     strides=[4, 8, 16, 32, 64]),
                 adapt_cfg: ConfigType = dict(type='dilation', dilation=3),
                 bridged_feature: bool = False,
                 with_cls: bool = True,
                 init_cfg: OptMultiConfig = None,
                 **kwargs) -> None:
        # These attributes are assigned before ``super().__init__`` because
        # ``_init_layers`` reads ``self.adapt_cfg`` and ``self.with_cls``.
        self.with_cls = with_cls
        self.anchor_strides = anchor_generator['strides']
        self.anchor_scales = anchor_generator['scales']
        self.bridged_feature = bridged_feature
        self.adapt_cfg = adapt_cfg
        super().__init__(
            in_channels=in_channels,
            anchor_generator=anchor_generator,
            init_cfg=init_cfg,
            **kwargs)

        # override sampling and sampler
        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
            # use PseudoSampler when sampling is False
            if self.train_cfg.get('sampler', None) is not None:
                self.sampler = TASK_UTILS.build(
                    self.train_cfg['sampler'], default_args=dict(context=self))
            else:
                self.sampler = PseudoSampler(context=self)

        if init_cfg is None:
            self.init_cfg = dict(
                type='Normal', std=0.01, override=[dict(name='rpn_reg')])
            if self.with_cls:
                self.init_cfg['override'].append(dict(name='rpn_cls'))

    def _init_layers(self) -> None:
        """Init layers of a CascadeRPN stage."""
        # Work on a copy so that renaming ``type`` -> ``adapt_type`` does not
        # modify ``self.adapt_cfg`` (its ``type`` key is read again later).
        adapt_cfg = copy.deepcopy(self.adapt_cfg)
        adapt_cfg['adapt_type'] = adapt_cfg.pop('type')
        self.rpn_conv = AdaptiveConv(self.in_channels, self.feat_channels,
                                     **adapt_cfg)
        if self.with_cls:
            self.rpn_cls = nn.Conv2d(self.feat_channels,
                                     self.num_anchors * self.cls_out_channels,
                                     1)
        self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
        self.relu = nn.ReLU(inplace=True)

    def forward_single(self, x: Tensor,
                       offset: Optional[Tensor]) -> Tuple[Tensor]:
        """Forward function of single scale.

        Args:
            x (Tensor): Feature map of a single level.
            offset (Tensor, optional): Offsets passed to the adaptive conv;
                None when the stage does not use the ``'offset'`` mode.

        Returns:
            tuple: ``(bridged_x, cls_score, bbox_pred)``. ``bridged_x`` is
            the conv output when ``bridged_feature`` is True, otherwise the
            untouched input. ``cls_score`` is None when ``with_cls`` is
            False.
        """
        bridged_x = x
        x = self.relu(self.rpn_conv(x, offset))
        if self.bridged_feature:
            bridged_x = x  # update feature
        cls_score = self.rpn_cls(x) if self.with_cls else None
        bbox_pred = self.rpn_reg(x)
        return bridged_x, cls_score, bbox_pred

    def forward(
            self,
            feats: List[Tensor],
            offset_list: Optional[List[Tensor]] = None) -> Tuple[List[Tensor]]:
        """Forward function.

        Args:
            feats (list[Tensor]): Multi-level feature maps.
            offset_list (list[Tensor], optional): Per-level offsets. When
                None, every level receives ``None`` as its offset.

        Returns:
            tuple[list]: Per-level results of :meth:`forward_single`,
            gathered by ``multi_apply``.
        """
        if offset_list is None:
            offset_list = [None for _ in range(len(feats))]
        return multi_apply(self.forward_single, feats, offset_list)

    def _region_targets_single(self, flat_anchors: Tensor, valid_flags: Tensor,
                               gt_instances: InstanceData, img_meta: dict,
                               gt_instances_ignore: InstanceData,
                               featmap_sizes: List[Tuple[int, int]],
                               num_level_anchors: List[int]) -> tuple:
        """Get anchor targets based on region for a single image.

        Args:
            flat_anchors (Tensor): Multi-level anchors of the image, which are
                concatenated into a single tensor of shape (num_anchors, 4)
            valid_flags (Tensor): Multi level valid flags of the image,
                which are concatenated into a single tensor of
                shape (num_anchors, ).
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It should includes ``bboxes`` and ``labels``
                attributes.
            img_meta (dict): Meta information for current image.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
            featmap_sizes (list[Tuple[int, int]]): Feature map size each level.
            num_level_anchors (list[int]): The number of anchors in each level.

        Returns:
            tuple:

            - labels (Tensor): Labels of all anchors of the image.
            - label_weights (Tensor): Label weights of all anchors.
            - bbox_targets (Tensor): BBox targets of all anchors.
            - bbox_weights (Tensor): BBox weights of all anchors.
            - pos_inds (Tensor): positive samples indexes.
            - neg_inds (Tensor): negative samples indexes.
            - sampling_result (:obj:`SamplingResult`): Sampling results.
        """
        pred_instances = InstanceData()
        pred_instances.priors = flat_anchors
        pred_instances.valid_flags = valid_flags

        assign_result = self.assigner.assign(
            pred_instances,
            gt_instances,
            img_meta,
            featmap_sizes,
            num_level_anchors,
            self.anchor_scales[0],
            self.anchor_strides,
            gt_instances_ignore=gt_instances_ignore,
            allowed_border=self.train_cfg['allowed_border'])
        sampling_result = self.sampler.sample(assign_result, pred_instances,
                                              gt_instances)

        # Targets start as all-zero and are filled only at sampled indices.
        num_anchors = flat_anchors.shape[0]
        bbox_targets = torch.zeros_like(flat_anchors)
        bbox_weights = torch.zeros_like(flat_anchors)
        labels = flat_anchors.new_zeros(num_anchors, dtype=torch.long)
        label_weights = flat_anchors.new_zeros(num_anchors, dtype=torch.float)

        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        if len(pos_inds) > 0:
            if not self.reg_decoded_bbox:
                pos_bbox_targets = self.bbox_coder.encode(
                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
            else:
                pos_bbox_targets = sampling_result.pos_gt_bboxes
            bbox_targets[pos_inds, :] = pos_bbox_targets
            bbox_weights[pos_inds, :] = 1.0
            labels[pos_inds] = sampling_result.pos_gt_labels
            # A non-positive ``pos_weight`` means "use weight 1.0".
            if self.train_cfg['pos_weight'] <= 0:
                label_weights[pos_inds] = 1.0
            else:
                label_weights[pos_inds] = self.train_cfg['pos_weight']
        if len(neg_inds) > 0:
            label_weights[neg_inds] = 1.0

        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds, sampling_result)

    def region_targets(
        self,
        anchor_list: List[List[Tensor]],
        valid_flag_list: List[List[Tensor]],
        featmap_sizes: List[Tuple[int, int]],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None,
        return_sampling_results: bool = False,
    ) -> tuple:
        """Compute regression and classification targets for anchors when using
        RegionAssigner.

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image.
            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
                each image.
            featmap_sizes (list[Tuple[int, int]]): Feature map size each level.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
            return_sampling_results (bool): Whether to append the per-image
                sampling results to the returned tuple. Defaults to False.

        Returns:
            tuple or None: None if any image yields ``None`` labels.
            Otherwise:

            - labels_list (list[Tensor]): Labels of each level.
            - label_weights_list (list[Tensor]): Label weights of each
              level.
            - bbox_targets_list (list[Tensor]): BBox targets of each level.
            - bbox_weights_list (list[Tensor]): BBox weights of each level.
            - avg_factor (int): Average factor that is used to average
              the loss. When using sampling method, avg_factor is usually
              the sum of positive and negative priors. When using
              ``PseudoSampler``, ``avg_factor`` is usually equal to the
              number of positive priors.
            - sampling_results_list (list): Only present when
              ``return_sampling_results`` is True.
        """
        num_imgs = len(batch_img_metas)
        assert len(anchor_list) == len(valid_flag_list) == num_imgs

        if batch_gt_instances_ignore is None:
            batch_gt_instances_ignore = [None] * num_imgs

        # anchor number of multi levels
        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
        # concat all level anchors to a single tensor
        concat_anchor_list = []
        concat_valid_flag_list = []
        for i in range(num_imgs):
            assert len(anchor_list[i]) == len(valid_flag_list[i])
            concat_anchor_list.append(torch.cat(anchor_list[i]))
            concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))

        # compute targets for each image
        (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
         pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply(
             self._region_targets_single,
             concat_anchor_list,
             concat_valid_flag_list,
             batch_gt_instances,
             batch_img_metas,
             batch_gt_instances_ignore,
             featmap_sizes=featmap_sizes,
             num_level_anchors=num_level_anchors)
        # no valid anchors
        if any(labels is None for labels in all_labels):
            return None
        # sampled anchors of all images
        avg_factor = sum(
            [results.avg_factor for results in sampling_results_list])
        # split targets to a list w.r.t. multiple levels
        labels_list = images_to_levels(all_labels, num_level_anchors)
        label_weights_list = images_to_levels(all_label_weights,
                                              num_level_anchors)
        bbox_targets_list = images_to_levels(all_bbox_targets,
                                             num_level_anchors)
        bbox_weights_list = images_to_levels(all_bbox_weights,
                                             num_level_anchors)
        res = (labels_list, label_weights_list, bbox_targets_list,
               bbox_weights_list, avg_factor)
        if return_sampling_results:
            res = res + (sampling_results_list, )
        return res

    def get_targets(
        self,
        anchor_list: List[List[Tensor]],
        valid_flag_list: List[List[Tensor]],
        featmap_sizes: List[Tuple[int, int]],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None,
        return_sampling_results: bool = False,
    ) -> tuple:
        """Compute regression and classification targets for anchors.

        Dispatches to :meth:`region_targets` when the assigner is a
        ``RegionAssigner`` and to the parent implementation otherwise
        (``featmap_sizes`` is only used by the former).

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image.
            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
                each image.
            featmap_sizes (list[Tuple[int, int]]): Feature map size each level.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
            return_sampling_results (bool): Whether to return the sampling
                results. Defaults to False.

        Returns:
            tuple:

            - labels_list (list[Tensor]): Labels of each level.
            - label_weights_list (list[Tensor]): Label weights of each
              level.
            - bbox_targets_list (list[Tensor]): BBox targets of each level.
            - bbox_weights_list (list[Tensor]): BBox weights of each level.
            - avg_factor (int): Average factor that is used to average
              the loss. When using sampling method, avg_factor is usually
              the sum of positive and negative priors. When using
              ``PseudoSampler``, ``avg_factor`` is usually equal to the
              number of positive priors.
        """
        if isinstance(self.assigner, RegionAssigner):
            cls_reg_targets = self.region_targets(
                anchor_list,
                valid_flag_list,
                featmap_sizes,
                batch_gt_instances,
                batch_img_metas,
                batch_gt_instances_ignore=batch_gt_instances_ignore,
                return_sampling_results=return_sampling_results)
        else:
            cls_reg_targets = super().get_targets(
                anchor_list,
                valid_flag_list,
                batch_gt_instances,
                batch_img_metas,
                batch_gt_instances_ignore=batch_gt_instances_ignore,
                return_sampling_results=return_sampling_results)
        return cls_reg_targets

    def anchor_offset(self, anchor_list: List[List[Tensor]],
                      anchor_strides: List[int],
                      featmap_sizes: List[Tuple[int, int]]) -> List[Tensor]:
        """Get offset for deformable conv based on anchor shape.

        NOTE: currently support deformable kernel_size=3 and dilation=1

        Args:
            anchor_list (list[list[tensor])): [NI, NLVL, NA, 4] list of
                multi-level anchors
            anchor_strides (list[int]): anchor stride of each level
            featmap_sizes (list[Tuple[int, int]]): Feature map size
                ``(h, w)`` of each level; each level must hold exactly
                ``h * w`` anchors.

        Returns:
            list[tensor]: Per-level offsets of the DeformConv kernel, as
            produced by ``images_to_levels``. Each per-anchor offset has
            ``2 * ks**2 = 18`` values; entries are presumably of shape
            [NI, NA, 18], which is the layout ``AdaptiveConv.forward``
            expects.
        """

        def _shape_offset(anchors, stride, ks=3, dilation=1):
            # currently support kernel_size=3 and dilation=1
            assert ks == 3 and dilation == 1
            pad = (ks - 1) // 2
            idx = torch.arange(-pad, pad + 1, dtype=dtype, device=device)
            # return order matters: relies on torch.meshgrid's default
            # ('ij') ordering.
            yy, xx = torch.meshgrid(idx, idx)
            xx = xx.reshape(-1)
            yy = yy.reshape(-1)
            # anchor size measured in feature-map cells
            w = (anchors[:, 2] - anchors[:, 0]) / stride
            h = (anchors[:, 3] - anchors[:, 1]) / stride
            # per-tap step minus the step the regular kernel already takes
            w = w / (ks - 1) - dilation
            h = h / (ks - 1) - dilation
            offset_x = w[:, None] * xx  # (NA, ks**2)
            offset_y = h[:, None] * yy  # (NA, ks**2)
            return offset_x, offset_y

        def _ctr_offset(anchors, stride, featmap_size):
            feat_h, feat_w = featmap_size
            assert len(anchors) == feat_h * feat_w

            x = (anchors[:, 0] + anchors[:, 2]) * 0.5
            y = (anchors[:, 1] + anchors[:, 3]) * 0.5
            # compute centers on feature map
            x = x / stride
            y = y / stride
            # compute predefine centers
            xx = torch.arange(0, feat_w, device=anchors.device)
            yy = torch.arange(0, feat_h, device=anchors.device)
            yy, xx = torch.meshgrid(yy, xx)
            xx = xx.reshape(-1).type_as(x)
            yy = yy.reshape(-1).type_as(y)

            offset_x = x - xx  # (NA, )
            offset_y = y - yy  # (NA, )
            return offset_x, offset_y

        num_imgs = len(anchor_list)
        num_lvls = len(anchor_list[0])
        # ``dtype`` and ``device`` are read by ``_shape_offset`` via closure.
        dtype = anchor_list[0][0].dtype
        device = anchor_list[0][0].device
        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]

        offset_list = []
        for i in range(num_imgs):
            mlvl_offset = []
            for lvl in range(num_lvls):
                c_offset_x, c_offset_y = _ctr_offset(anchor_list[i][lvl],
                                                     anchor_strides[lvl],
                                                     featmap_sizes[lvl])
                s_offset_x, s_offset_y = _shape_offset(anchor_list[i][lvl],
                                                       anchor_strides[lvl])

                # offset = ctr_offset + shape_offset
                offset_x = s_offset_x + c_offset_x[:, None]
                offset_y = s_offset_y + c_offset_y[:, None]

                # offset order (y0, x0, y1, x1, ..., y8, x8)
                offset = torch.stack([offset_y, offset_x], dim=-1)
                offset = offset.reshape(offset.size(0), -1)  # [NA, 2*ks**2]
                mlvl_offset.append(offset)
            offset_list.append(torch.cat(mlvl_offset))  # [totalNA, 2*ks**2]
        offset_list = images_to_levels(offset_list, num_level_anchors)
        return offset_list

    def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
                            anchors: Tensor, labels: Tensor,
                            label_weights: Tensor, bbox_targets: Tensor,
                            bbox_weights: Tensor, avg_factor: int) -> tuple:
        """Loss function on single scale.

        Returns:
            tuple: ``(loss_cls, loss_reg)``; ``loss_cls`` is None when the
            stage has no classification branch (``with_cls`` is False).
        """
        # classification loss
        if self.with_cls:
            labels = labels.reshape(-1)
            label_weights = label_weights.reshape(-1)
            cls_score = cls_score.permute(0, 2, 3,
                                          1).reshape(-1, self.cls_out_channels)
            loss_cls = self.loss_cls(
                cls_score, labels, label_weights, avg_factor=avg_factor)
        # regression loss
        bbox_targets = bbox_targets.reshape(-1, 4)
        bbox_weights = bbox_weights.reshape(-1, 4)
        bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
        if self.reg_decoded_bbox:
            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
            # is applied directly on the decoded bounding boxes, it
            # decodes the already encoded coordinates to absolute format.
            anchors = anchors.reshape(-1, 4)
            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
        loss_reg = self.loss_bbox(
            bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor)
        if self.with_cls:
            return loss_cls, loss_reg
        return None, loss_reg

    def loss_by_feat(
        self,
        anchor_list: List[List[Tensor]],
        valid_flag_list: List[List[Tensor]],
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None
    ) -> Dict[str, Tensor]:
        """Compute losses of the head.

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image.
            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
                each image. The outer list indicates images, and the inner list
                corresponds to feature levels of the image. Each element of
                the inner list is a tensor of shape (num_anchors, )
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W)
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components. Contains
            ``loss_rpn_reg`` and, when ``with_cls`` is True, also
            ``loss_rpn_cls``.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds]
        cls_reg_targets = self.get_targets(
            anchor_list,
            valid_flag_list,
            featmap_sizes,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore=batch_gt_instances_ignore,
            return_sampling_results=True)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         avg_factor, sampling_results_list) = cls_reg_targets
        if not sampling_results_list[0].avg_factor_with_neg:
            # 200 is hard-coded average factor,
            # which follows guided anchoring.
            avg_factor = sum([label.numel() for label in labels_list]) / 200.0

        # change per image, per level anchor_list to per_level, per_image
        mlvl_anchor_list = list(zip(*anchor_list))
        # concat mlvl_anchor_list
        mlvl_anchor_list = [
            torch.cat(anchors, dim=0) for anchors in mlvl_anchor_list
        ]

        losses = multi_apply(
            self.loss_by_feat_single,
            cls_scores,
            bbox_preds,
            mlvl_anchor_list,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
            avg_factor=avg_factor)
        if self.with_cls:
            return dict(loss_rpn_cls=losses[0], loss_rpn_reg=losses[1])
        return dict(loss_rpn_reg=losses[1])

    def predict_by_feat(self,
                        anchor_list: List[List[Tensor]],
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        batch_img_metas: List[dict],
                        cfg: Optional[ConfigDict] = None,
                        rescale: bool = False) -> InstanceList:
        """Get proposal predict. Overriding to enable input ``anchor_list``
        from outside.

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image.
            cls_scores (list[Tensor]): Classification scores for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 4, H, W).
            batch_img_metas (list[dict], Optional): Image meta info.
            cfg (:obj:`ConfigDict`, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arrange as (x1, y1, x2, y2).
        """
        assert len(cls_scores) == len(bbox_preds)

        result_list = []
        for img_id in range(len(batch_img_metas)):
            cls_score_list = select_single_mlvl(cls_scores, img_id)
            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)

            proposals = self._predict_by_feat_single(
                cls_scores=cls_score_list,
                bbox_preds=bbox_pred_list,
                mlvl_anchors=anchor_list[img_id],
                img_meta=batch_img_metas[img_id],
                cfg=cfg,
                rescale=rescale)
            result_list.append(proposals)
        return result_list

    def _predict_by_feat_single(self,
                                cls_scores: List[Tensor],
                                bbox_preds: List[Tensor],
                                mlvl_anchors: List[Tensor],
                                img_meta: dict,
                                cfg: ConfigDict,
                                rescale: bool = False) -> InstanceData:
        """Transform outputs of a single image into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box scores from all scale
                levels of a single image, each item has shape
                (num_anchors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas from
                all scale levels of a single image, each item has
                shape (num_anchors * 4, H, W).
            mlvl_anchors (list[Tensor]): Box reference from all scale
                levels of a single image, each item has shape
                (num_total_anchors, 4).
            img_meta (dict): Meta information of the image. Its
                ``'img_shape'`` entry is passed to the bbox coder as
                ``max_shape``; the whole dict is forwarded to the
                post-processing step.
            cfg (:obj:`ConfigDict`): Test / postprocessing configuration,
                if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.

        Returns:
            :obj:`InstanceData`: Detection results of each image
            after the post process.
            Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arrange as (x1, y1, x2, y2).
        """
        cfg = self.test_cfg if cfg is None else cfg
        cfg = copy.deepcopy(cfg)
        # bboxes from different level should be independent during NMS,
        # level_ids are used as labels for batched NMS to separate them
        level_ids = []
        mlvl_scores = []
        mlvl_bbox_preds = []
        mlvl_valid_anchors = []
        nms_pre = cfg.get('nms_pre', -1)
        for idx in range(len(cls_scores)):
            rpn_cls_score = cls_scores[idx]
            rpn_bbox_pred = bbox_preds[idx]
            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
            if self.use_sigmoid_cls:
                rpn_cls_score = rpn_cls_score.reshape(-1)
                scores = rpn_cls_score.sigmoid()
            else:
                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
                # We set FG labels to [0, num_class-1] and BG label to
                # num_class in RPN head since mmdet v2.5, which is unified to
                # be consistent with other head since mmdet v2.0. In mmdet v2.0
                # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
                scores = rpn_cls_score.softmax(dim=1)[:, 0]
            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            anchors = mlvl_anchors[idx]

            if 0 < nms_pre < scores.shape[0]:
                # sort is faster than topk
                # _, topk_inds = scores.topk(cfg.nms_pre)
                ranked_scores, rank_inds = scores.sort(descending=True)
                topk_inds = rank_inds[:nms_pre]
                scores = ranked_scores[:nms_pre]
                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
                anchors = anchors[topk_inds, :]
            mlvl_scores.append(scores)
            mlvl_bbox_preds.append(rpn_bbox_pred)
            mlvl_valid_anchors.append(anchors)
            level_ids.append(
                scores.new_full((scores.size(0), ), idx, dtype=torch.long))

        anchors = torch.cat(mlvl_valid_anchors)
        rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
        bboxes = self.bbox_coder.decode(
            anchors, rpn_bbox_pred, max_shape=img_meta['img_shape'])

        proposals = InstanceData()
        proposals.bboxes = bboxes
        proposals.scores = torch.cat(mlvl_scores)
        proposals.level_ids = torch.cat(level_ids)

        return self._bbox_post_process(
            results=proposals, cfg=cfg, rescale=rescale, img_meta=img_meta)

    def refine_bboxes(self, anchor_list: List[List[Tensor]],
                      bbox_preds: List[Tensor],
                      img_metas: List[dict]) -> List[List[Tensor]]:
        """Refine bboxes through stages.

        Decodes this stage's (detached) box predictions against the given
        anchors; the result has the same images-by-levels nesting as
        ``anchor_list``.
        """
        num_levels = len(bbox_preds)
        new_anchor_list = []
        for img_id in range(len(img_metas)):
            mlvl_anchors = []
            for i in range(num_levels):
                # detach: gradients must not flow into the refined anchors
                bbox_pred = bbox_preds[i][img_id].detach()
                bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
                img_shape = img_metas[img_id]['img_shape']
                bboxes = self.bbox_coder.decode(anchor_list[img_id][i],
                                                bbox_pred, img_shape)
                mlvl_anchors.append(bboxes)
            new_anchor_list.append(mlvl_anchors)
        return new_anchor_list

    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
        """Perform forward propagation and loss calculation of the detection
        head on the features of the upstream network.

        Args:
            x (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            batch_data_samples (List[:obj:`DetDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.

        Returns:
            dict: A dictionary of loss components.
        """
        outputs = unpack_gt_instances(batch_data_samples)
        batch_gt_instances, _, batch_img_metas = outputs

        featmap_sizes = [featmap.size()[-2:] for featmap in x]
        device = x[0].device
        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, batch_img_metas, device=device)

        if self.adapt_cfg['type'] == 'offset':
            offset_list = self.anchor_offset(anchor_list, self.anchor_strides,
                                             featmap_sizes)
        else:
            offset_list = None

        x, cls_score, bbox_pred = self(x, offset_list)
        rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred,
                           batch_gt_instances, batch_img_metas)
        losses = self.loss_by_feat(*rpn_loss_inputs)

        return losses

    def loss_and_predict(
        self,
        x: Tuple[Tensor],
        batch_data_samples: SampleList,
        proposal_cfg: Optional[ConfigDict] = None,
    ) -> Tuple[dict, InstanceList]:
        """Perform forward propagation of the head, then calculate loss and
        predictions from the features and data samples.

        Args:
            x (tuple[Tensor]): Features from FPN.
            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
                the meta information of each image and corresponding
                annotations.
            proposal_cfg (:obj`ConfigDict`, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None.

        Returns:
            tuple: the return value is a tuple contains:

            - losses: (dict[str, Tensor]): A dictionary of loss components.
            - predictions (list[:obj:`InstanceData`]): Detection
              results of each image after the post process.
        """
        outputs = unpack_gt_instances(batch_data_samples)
        batch_gt_instances, _, batch_img_metas = outputs

        featmap_sizes = [featmap.size()[-2:] for featmap in x]
        device = x[0].device
        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, batch_img_metas, device=device)

        if self.adapt_cfg['type'] == 'offset':
            offset_list = self.anchor_offset(anchor_list, self.anchor_strides,
                                             featmap_sizes)
        else:
            offset_list = None

        x, cls_score, bbox_pred = self(x, offset_list)
        rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred,
                           batch_gt_instances, batch_img_metas)
        losses = self.loss_by_feat(*rpn_loss_inputs)

        predictions = self.predict_by_feat(
            anchor_list,
            cls_score,
            bbox_pred,
            batch_img_metas=batch_img_metas,
            cfg=proposal_cfg)
        return losses, predictions

    def predict(self,
                x: Tuple[Tensor],
                batch_data_samples: SampleList,
                rescale: bool = False) -> InstanceList:
        """Perform forward propagation of the detection head and predict
        detection results on the features of the upstream network.

        Args:
            x (tuple[Tensor]): Multi-level features from the
                upstream network, each is a 4D-tensor.
            batch_data_samples (List[:obj:`DetDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[obj:`InstanceData`]: Detection results of each image
            after the post process.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]

        featmap_sizes = [featmap.size()[-2:] for featmap in x]
        device = x[0].device
        anchor_list, _ = self.get_anchors(
            featmap_sizes, batch_img_metas, device=device)

        if self.adapt_cfg['type'] == 'offset':
            offset_list = self.anchor_offset(anchor_list, self.anchor_strides,
                                             featmap_sizes)
        else:
            offset_list = None

        x, cls_score, bbox_pred = self(x, offset_list)
        # A single stage has no ``stages`` attribute (that lives on
        # ``CascadeRPNHead``), so decode with this stage's own
        # ``predict_by_feat`` -- same as ``loss_and_predict`` does.
        predictions = self.predict_by_feat(
            anchor_list,
            cls_score,
            bbox_pred,
            batch_img_metas=batch_img_metas,
            rescale=rescale)
        return predictions
[docs]
@MODELS.register_module()
class CascadeRPNHead(BaseDenseHead):
"""The CascadeRPNHead will predict more accurate region proposals, which is
required for two-stage detectors (such as Fast/Faster R-CNN). CascadeRPN
consists of a sequence of RPNStage to progressively improve the accuracy of
the detected proposals.
More details can be found in ``https://arxiv.org/abs/1909.06720``.
Args:
num_stages (int): number of CascadeRPN stages.
stages (list[:obj:`ConfigDict` or dict]): list of configs to build
the stages.
train_cfg (list[:obj:`ConfigDict` or dict]): list of configs at
training time each stage.
test_cfg (:obj:`ConfigDict` or dict): config at testing time.
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
list[dict]): Initialization config dict.
"""
def __init__(self,
             num_classes: int,
             num_stages: int,
             stages: List[ConfigType],
             train_cfg: List[ConfigType],
             test_cfg: ConfigType,
             init_cfg: OptMultiConfig = None) -> None:
    """Build every cascade stage from its config.

    Args:
        num_classes (int): Number of classes; only 1 is supported.
        num_stages (int): Number of stages; must equal ``len(stages)``.
        stages (list[:obj:`ConfigDict` or dict]): Configs used to build the
            stages. They are copied before ``train_cfg`` / ``test_cfg`` are
            injected, so the caller's configs are left untouched.
        train_cfg (list[:obj:`ConfigDict` or dict]): Per-stage training
            configs, indexed like ``stages``. May be None, in which case
            every stage receives ``train_cfg=None``.
        test_cfg (:obj:`ConfigDict` or dict): Testing config shared by all
            stages.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """
    super().__init__(init_cfg=init_cfg)
    assert num_classes == 1, 'Only support num_classes == 1'
    assert num_stages == len(stages)
    self.num_stages = num_stages
    # Be careful! Pretrained weights cannot be loaded when use
    # nn.ModuleList
    self.stages = ModuleList()
    for i, stage_cfg in enumerate(stages):
        # Copy first: injecting train/test cfg must not mutate the
        # caller-provided stage config.
        stage_cfg = copy.deepcopy(stage_cfg)
        stage_cfg.update(
            train_cfg=train_cfg[i] if train_cfg is not None else None,
            test_cfg=test_cfg)
        self.stages.append(MODELS.build(stage_cfg))
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
[docs]
def predict_by_feat(self):
    """Intentional no-op.

    ``predict_by_feat()`` is implemented in ``StageCascadeRPNHead``; this
    method does nothing and returns None.
    """
    return None
[docs]
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
"""Perform forward propagation and loss calculation of the detection
head on the features of the upstream network.
Args:
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
batch_data_samples (List[:obj:`DetDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
Returns:
dict: A dictionary of loss components.
"""
outputs = unpack_gt_instances(batch_data_samples)
batch_gt_instances, _, batch_img_metas = outputs
featmap_sizes = [featmap.size()[-2:] for featmap in x]
device = x[0].device
anchor_list, valid_flag_list = self.stages[0].get_anchors(
featmap_sizes, batch_img_metas, device=device)
losses = dict()
for i in range(self.num_stages):
stage = self.stages[i]
if stage.adapt_cfg['type'] == 'offset':
offset_list = stage.anchor_offset(anchor_list,
stage.anchor_strides,
featmap_sizes)
else:
offset_list = None
x, cls_score, bbox_pred = stage(x, offset_list)
rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score,
bbox_pred, batch_gt_instances, batch_img_metas)
stage_loss = stage.loss_by_feat(*rpn_loss_inputs)
for name, value in stage_loss.items():
losses['s{}.{}'.format(i, name)] = value
# refine boxes
if i < self.num_stages - 1:
anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
batch_img_metas)
return losses
[docs]
def loss_and_predict(
self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
proposal_cfg: Optional[ConfigDict] = None,
) -> Tuple[dict, InstanceList]:
"""Perform forward propagation of the head, then calculate loss and
predictions from the features and data samples.
Args:
x (tuple[Tensor]): Features from FPN.
batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
the meta information of each image and corresponding
annotations.
proposal_cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
Defaults to None.
Returns:
tuple: the return value is a tuple contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- predictions (list[:obj:`InstanceData`]): Detection
results of each image after the post process.
"""
outputs = unpack_gt_instances(batch_data_samples)
batch_gt_instances, _, batch_img_metas = outputs
featmap_sizes = [featmap.size()[-2:] for featmap in x]
device = x[0].device
anchor_list, valid_flag_list = self.stages[0].get_anchors(
featmap_sizes, batch_img_metas, device=device)
losses = dict()
for i in range(self.num_stages):
stage = self.stages[i]
if stage.adapt_cfg['type'] == 'offset':
offset_list = stage.anchor_offset(anchor_list,
stage.anchor_strides,
featmap_sizes)
else:
offset_list = None
x, cls_score, bbox_pred = stage(x, offset_list)
rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score,
bbox_pred, batch_gt_instances, batch_img_metas)
stage_loss = stage.loss_by_feat(*rpn_loss_inputs)
for name, value in stage_loss.items():
losses['s{}.{}'.format(i, name)] = value
# refine boxes
if i < self.num_stages - 1:
anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
batch_img_metas)
predictions = self.stages[-1].predict_by_feat(
anchor_list,
cls_score,
bbox_pred,
batch_img_metas=batch_img_metas,
cfg=proposal_cfg)
return losses, predictions
[docs]
def predict(self,
x: Tuple[Tensor],
batch_data_samples: SampleList,
rescale: bool = False) -> InstanceList:
"""Perform forward propagation of the detection head and predict
detection results on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_data_samples (List[:obj:`DetDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[obj:`InstanceData`]: Detection results of each image
after the post process.
"""
batch_img_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
featmap_sizes = [featmap.size()[-2:] for featmap in x]
device = x[0].device
anchor_list, _ = self.stages[0].get_anchors(
featmap_sizes, batch_img_metas, device=device)
for i in range(self.num_stages):
stage = self.stages[i]
if stage.adapt_cfg['type'] == 'offset':
offset_list = stage.anchor_offset(anchor_list,
stage.anchor_strides,
featmap_sizes)
else:
offset_list = None
x, cls_score, bbox_pred = stage(x, offset_list)
if i < self.num_stages - 1:
anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
batch_img_metas)
predictions = self.stages[-1].predict_by_feat(
anchor_list,
cls_score,
bbox_pred,
batch_img_metas=batch_img_metas,
rescale=rescale)
return predictions