|
|
|
from typing import List, Tuple |
|
|
|
import numpy as np |
|
import torch |
|
import torch.nn as nn |
|
from mmcv.cnn import Conv2d, Linear, MaxPool2d |
|
from mmengine.config import ConfigDict |
|
from mmengine.model import BaseModule |
|
from mmengine.structures import InstanceData |
|
from torch import Tensor |
|
from torch.nn.modules.utils import _pair |
|
|
|
from mmdet.models.task_modules.samplers import SamplingResult |
|
from mmdet.registry import MODELS |
|
from mmdet.utils import ConfigType, InstanceList, OptMultiConfig |
|
|
|
|
|
@MODELS.register_module() |
|
class MaskIoUHead(BaseModule): |
|
"""Mask IoU Head. |
|
|
|
This head predicts the IoU of predicted masks and corresponding gt masks. |
|
|
|
Args: |
|
num_convs (int): The number of convolution layers. Defaults to 4. |
|
num_fcs (int): The number of fully connected layers. Defaults to 2. |
|
roi_feat_size (int): RoI feature size. Default to 14. |
|
in_channels (int): The channel number of inputs features. |
|
Defaults to 256. |
|
conv_out_channels (int): The feature channels of convolution layers. |
|
Defaults to 256. |
|
fc_out_channels (int): The feature channels of fully connected layers. |
|
Defaults to 1024. |
|
num_classes (int): Number of categories excluding the background |
|
category. Defaults to 80. |
|
loss_iou (:obj:`ConfigDict` or dict): IoU loss. |
|
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ |
|
dict], optional): Initialization config dict. |
|
""" |
|
|
|
def __init__( |
|
self, |
|
num_convs: int = 4, |
|
num_fcs: int = 2, |
|
roi_feat_size: int = 14, |
|
in_channels: int = 256, |
|
conv_out_channels: int = 256, |
|
fc_out_channels: int = 1024, |
|
num_classes: int = 80, |
|
loss_iou: ConfigType = dict(type='MSELoss', loss_weight=0.5), |
|
init_cfg: OptMultiConfig = [ |
|
dict(type='Kaiming', override=dict(name='convs')), |
|
dict(type='Caffe2Xavier', override=dict(name='fcs')), |
|
dict(type='Normal', std=0.01, override=dict(name='fc_mask_iou')) |
|
] |
|
) -> None: |
|
super().__init__(init_cfg=init_cfg) |
|
self.in_channels = in_channels |
|
self.conv_out_channels = conv_out_channels |
|
self.fc_out_channels = fc_out_channels |
|
self.num_classes = num_classes |
|
|
|
self.convs = nn.ModuleList() |
|
for i in range(num_convs): |
|
if i == 0: |
|
|
|
in_channels = self.in_channels + 1 |
|
else: |
|
in_channels = self.conv_out_channels |
|
stride = 2 if i == num_convs - 1 else 1 |
|
self.convs.append( |
|
Conv2d( |
|
in_channels, |
|
self.conv_out_channels, |
|
3, |
|
stride=stride, |
|
padding=1)) |
|
|
|
roi_feat_size = _pair(roi_feat_size) |
|
pooled_area = (roi_feat_size[0] // 2) * (roi_feat_size[1] // 2) |
|
self.fcs = nn.ModuleList() |
|
for i in range(num_fcs): |
|
in_channels = ( |
|
self.conv_out_channels * |
|
pooled_area if i == 0 else self.fc_out_channels) |
|
self.fcs.append(Linear(in_channels, self.fc_out_channels)) |
|
|
|
self.fc_mask_iou = Linear(self.fc_out_channels, self.num_classes) |
|
self.relu = nn.ReLU() |
|
self.max_pool = MaxPool2d(2, 2) |
|
self.loss_iou = MODELS.build(loss_iou) |
|
|
|
def forward(self, mask_feat: Tensor, mask_preds: Tensor) -> Tensor: |
|
"""Forward function. |
|
|
|
Args: |
|
mask_feat (Tensor): Mask features from upstream models. |
|
mask_preds (Tensor): Mask predictions from mask head. |
|
|
|
Returns: |
|
Tensor: Mask IoU predictions. |
|
""" |
|
mask_preds = mask_preds.sigmoid() |
|
mask_pred_pooled = self.max_pool(mask_preds.unsqueeze(1)) |
|
|
|
x = torch.cat((mask_feat, mask_pred_pooled), 1) |
|
|
|
for conv in self.convs: |
|
x = self.relu(conv(x)) |
|
x = x.flatten(1) |
|
for fc in self.fcs: |
|
x = self.relu(fc(x)) |
|
mask_iou = self.fc_mask_iou(x) |
|
return mask_iou |
|
|
|
def loss_and_target(self, mask_iou_pred: Tensor, mask_preds: Tensor, |
|
mask_targets: Tensor, |
|
sampling_results: List[SamplingResult], |
|
batch_gt_instances: InstanceList, |
|
rcnn_train_cfg: ConfigDict) -> dict: |
|
"""Calculate the loss and targets of MaskIoUHead. |
|
|
|
Args: |
|
mask_iou_pred (Tensor): Mask IoU predictions results, has shape |
|
(num_pos, num_classes) |
|
mask_preds (Tensor): Mask predictions from mask head, has shape |
|
(num_pos, mask_size, mask_size). |
|
mask_targets (Tensor): The ground truth masks assigned with |
|
predictions, has shape |
|
(num_pos, mask_size, mask_size). |
|
sampling_results (List[obj:SamplingResult]): Assign results of |
|
all images in a batch after sampling. |
|
batch_gt_instances (list[:obj:`InstanceData`]): Batch of |
|
gt_instance. It includes ``masks`` inside. |
|
rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN. |
|
|
|
Returns: |
|
dict: A dictionary of loss and targets components. |
|
The targets are only used for cascade rcnn. |
|
""" |
|
mask_iou_targets = self.get_targets( |
|
sampling_results=sampling_results, |
|
batch_gt_instances=batch_gt_instances, |
|
mask_preds=mask_preds, |
|
mask_targets=mask_targets, |
|
rcnn_train_cfg=rcnn_train_cfg) |
|
|
|
pos_inds = mask_iou_targets > 0 |
|
if pos_inds.sum() > 0: |
|
loss_mask_iou = self.loss_iou(mask_iou_pred[pos_inds], |
|
mask_iou_targets[pos_inds]) |
|
else: |
|
loss_mask_iou = mask_iou_pred.sum() * 0 |
|
return dict(loss_mask_iou=loss_mask_iou) |
|
|
|
def get_targets(self, sampling_results: List[SamplingResult], |
|
batch_gt_instances: InstanceList, mask_preds: Tensor, |
|
mask_targets: Tensor, |
|
rcnn_train_cfg: ConfigDict) -> Tensor: |
|
"""Compute target of mask IoU. |
|
|
|
Mask IoU target is the IoU of the predicted mask (inside a bbox) and |
|
the gt mask of corresponding gt mask (the whole instance). |
|
The intersection area is computed inside the bbox, and the gt mask area |
|
is computed with two steps, firstly we compute the gt area inside the |
|
bbox, then divide it by the area ratio of gt area inside the bbox and |
|
the gt area of the whole instance. |
|
|
|
Args: |
|
sampling_results (list[:obj:`SamplingResult`]): sampling results. |
|
batch_gt_instances (list[:obj:`InstanceData`]): Batch of |
|
gt_instance. It includes ``masks`` inside. |
|
mask_preds (Tensor): Predicted masks of each positive proposal, |
|
shape (num_pos, h, w). |
|
mask_targets (Tensor): Gt mask of each positive proposal, |
|
binary map of the shape (num_pos, h, w). |
|
rcnn_train_cfg (obj:`ConfigDict`): Training config for R-CNN part. |
|
|
|
Returns: |
|
Tensor: mask iou target (length == num positive). |
|
""" |
|
pos_proposals = [res.pos_priors for res in sampling_results] |
|
pos_assigned_gt_inds = [ |
|
res.pos_assigned_gt_inds for res in sampling_results |
|
] |
|
gt_masks = [res.masks for res in batch_gt_instances] |
|
|
|
|
|
|
|
area_ratios = map(self._get_area_ratio, pos_proposals, |
|
pos_assigned_gt_inds, gt_masks) |
|
area_ratios = torch.cat(list(area_ratios)) |
|
assert mask_targets.size(0) == area_ratios.size(0) |
|
|
|
mask_preds = (mask_preds > rcnn_train_cfg.mask_thr_binary).float() |
|
mask_pred_areas = mask_preds.sum((-1, -2)) |
|
|
|
|
|
overlap_areas = (mask_preds * mask_targets).sum((-1, -2)) |
|
|
|
|
|
gt_full_areas = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7) |
|
|
|
mask_iou_targets = overlap_areas / ( |
|
mask_pred_areas + gt_full_areas - overlap_areas) |
|
return mask_iou_targets |
|
|
|
def _get_area_ratio(self, pos_proposals: Tensor, |
|
pos_assigned_gt_inds: Tensor, |
|
gt_masks: InstanceData) -> Tensor: |
|
"""Compute area ratio of the gt mask inside the proposal and the gt |
|
mask of the corresponding instance. |
|
|
|
Args: |
|
pos_proposals (Tensor): Positive proposals, has shape (num_pos, 4). |
|
pos_assigned_gt_inds (Tensor): positive proposals assigned ground |
|
truth index. |
|
gt_masks (BitmapMask or PolygonMask): Gt masks (the whole instance) |
|
of each image, with the same shape of the input image. |
|
|
|
Returns: |
|
Tensor: The area ratio of the gt mask inside the proposal and the |
|
gt mask of the corresponding instance. |
|
""" |
|
num_pos = pos_proposals.size(0) |
|
if num_pos > 0: |
|
area_ratios = [] |
|
proposals_np = pos_proposals.cpu().numpy() |
|
pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() |
|
|
|
gt_instance_mask_area = gt_masks.areas |
|
for i in range(num_pos): |
|
gt_mask = gt_masks[pos_assigned_gt_inds[i]] |
|
|
|
|
|
bbox = proposals_np[i, :].astype(np.int32) |
|
gt_mask_in_proposal = gt_mask.crop(bbox) |
|
|
|
ratio = gt_mask_in_proposal.areas[0] / ( |
|
gt_instance_mask_area[pos_assigned_gt_inds[i]] + 1e-7) |
|
area_ratios.append(ratio) |
|
area_ratios = torch.from_numpy(np.stack(area_ratios)).float().to( |
|
pos_proposals.device) |
|
else: |
|
area_ratios = pos_proposals.new_zeros((0, )) |
|
return area_ratios |
|
|
|
def predict_by_feat(self, mask_iou_preds: Tuple[Tensor], |
|
results_list: InstanceList) -> InstanceList: |
|
"""Predict the mask iou and calculate it into ``results.scores``. |
|
|
|
Args: |
|
mask_iou_preds (Tensor): Mask IoU predictions results, has shape |
|
(num_proposals, num_classes) |
|
results_list (list[:obj:`InstanceData`]): Detection results of |
|
each image. |
|
|
|
Returns: |
|
list[:obj:`InstanceData`]: Detection results of each image |
|
after the post process. Each item usually contains following keys. |
|
|
|
- scores (Tensor): Classification scores, has a shape |
|
(num_instance, ) |
|
- labels (Tensor): Labels of bboxes, has a shape |
|
(num_instances, ). |
|
- bboxes (Tensor): Has a shape (num_instances, 4), |
|
the last dimension 4 arrange as (x1, y1, x2, y2). |
|
- masks (Tensor): Has a shape (num_instances, H, W). |
|
""" |
|
assert len(mask_iou_preds) == len(results_list) |
|
for results, mask_iou_pred in zip(results_list, mask_iou_preds): |
|
labels = results.labels |
|
scores = results.scores |
|
results.scores = scores * mask_iou_pred[range(labels.size(0)), |
|
labels] |
|
return results_list |
|
|