# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import Dict, List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Linear
from mmcv.ops.nms import nms
from mmengine.model import bias_init_with_prob, constant_init
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS
from mmdet.structures import SampleList
from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
from ..layers import inverse_sigmoid
from ..utils import multi_apply
from .detr_head import DETRHead


# def adjust_bbox_to_pixel(bboxes: Tensor):
#     # floor the top-left corner coordinates
#     adjusted_bboxes = torch.floor(bboxes)
#     # ceil the bottom-right corner coordinates
#     adjusted_bboxes[:, 2:] = torch.ceil(bboxes[:, 2:])
#     return adjusted_bboxes


def adjust_bbox_to_pixel(bboxes: Tensor) -> Tensor:
    # round box coordinates to the nearest integer pixel
    adjusted_bboxes = torch.round(bboxes)
    return adjusted_bboxes


@MODELS.register_module()
class DINOHead(DETRHead):
    r"""Head of the DINO: DETR with Improved DeNoising Anchor Boxes
    for End-to-End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/IDEA-Research/DINO>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2203.03605>`_ .
    """

    def __init__(self,
                 *args,
                 share_pred_layer: bool = False,
                 num_pred_layer: int = 6,
                 as_two_stage: bool = False,
                 pre_bboxes_round: bool = False,
                 **kwargs) -> None:
        self.share_pred_layer = share_pred_layer
        self.num_pred_layer = num_pred_layer
        self.as_two_stage = as_two_stage
        self.pre_bboxes_round = pre_bboxes_round
        super().__init__(*args, **kwargs)

    def _init_layers(self) -> None:
        """Initialize classification branch and regression branch of head."""
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        if self.share_pred_layer:
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(self.num_pred_layer)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(self.num_pred_layer)])
        else:
            self.cls_branches = nn.ModuleList(
                [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)])
            self.reg_branches = nn.ModuleList([
                copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer)
            ])

    def init_weights(self) -> None:
        """Initialize weights of the Deformable DETR head."""
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m.bias, bias_init)
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)

    def forward(self, hidden_states: Tensor,
                references: List[Tensor]) -> Tuple[Tensor]:
        """Forward function.

        Args:
            hidden_states (Tensor): Hidden states output from each decoder
                layer, has shape (num_decoder_layers, bs, num_queries, dim).
            references (list[Tensor]): List of the reference from the decoder.
                The first reference is the `init_reference` (initial) and the
                other num_decoder_layers(6) references are `inter_references`
                (intermediate). The `init_reference` has shape (bs,
                num_queries, 4) when `as_two_stage` of the detector is `True`,
                otherwise (bs, num_queries, 2). Each `inter_reference` has
                shape (bs, num_queries, 4) when `with_box_refine` of the
                detector is `True`, otherwise (bs, num_queries, 2).
                The coordinates are arranged as (cx, cy) when the last
                dimension is 2, and (cx, cy, w, h) when it is 4.

        Returns:
            tuple[Tensor]: results of head containing the following tensor.

            - all_layers_outputs_classes (Tensor): Outputs from the
              classification head, has shape (num_decoder_layers, bs,
              num_queries, cls_out_channels).
            - all_layers_outputs_coords (Tensor): Sigmoid outputs from the
              regression head with normalized coordinate format
              (cx, cy, w, h), has shape (num_decoder_layers, bs,
              num_queries, 4) with the last dimension arranged as
              (cx, cy, w, h).
        """
        all_layers_outputs_classes = []
        all_layers_outputs_coords = []

        for layer_id in range(hidden_states.shape[0]):
            reference = inverse_sigmoid(references[layer_id])
            # NOTE The last reference will not be used.
            hidden_state = hidden_states[layer_id]
            outputs_class = self.cls_branches[layer_id](hidden_state)
            tmp_reg_preds = self.reg_branches[layer_id](hidden_state)
            if reference.shape[-1] == 4:
                # When `layer` is 0 and `as_two_stage` of the detector
                # is `True`, or when `layer` is greater than 0 and
                # `with_box_refine` of the detector is `True`.
                tmp_reg_preds += reference
            else:
                # When `layer` is 0 and `as_two_stage` of the detector
                # is `False`, or when `layer` is greater than 0 and
                # `with_box_refine` of the detector is `False`.
                assert reference.shape[-1] == 2
                tmp_reg_preds[..., :2] += reference
            outputs_coord = tmp_reg_preds.sigmoid()
            all_layers_outputs_classes.append(outputs_class)
            all_layers_outputs_coords.append(outputs_coord)

        all_layers_outputs_classes = torch.stack(all_layers_outputs_classes)
        all_layers_outputs_coords = torch.stack(all_layers_outputs_coords)
        return all_layers_outputs_classes, all_layers_outputs_coords

    def loss(self, hidden_states: Tensor, references: List[Tensor],
             enc_outputs_class: Tensor, enc_outputs_coord: Tensor,
             batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict:
        """Perform forward propagation and loss calculation of the detection
        head on the queries of the upstream network.

        Args:
            hidden_states (Tensor): Hidden states output from each decoder
                layer, has shape (num_decoder_layers, bs, num_queries_total,
                dim), where `num_queries_total` is the sum of
                `num_denoising_queries` and `num_matching_queries` when
                `self.training` is `True`, else `num_matching_queries`.
            references (list[Tensor]): List of the reference from the decoder.
                The first reference is the `init_reference` (initial) and the
                other num_decoder_layers(6) references are `inter_references`
                (intermediate). The `init_reference` has shape (bs,
                num_queries_total, 4) and each `inter_reference` has shape
                (bs, num_queries, 4) with the last dimension arranged as
                (cx, cy, w, h).
            enc_outputs_class (Tensor): The score of each point on encode
                feature map, has shape (bs, num_feat_points,
                cls_out_channels).
            enc_outputs_coord (Tensor): The proposal generate from the encode
                feature map, has shape (bs, num_feat_points, 4) with the last
                dimension arranged as (cx, cy, w, h).
            batch_data_samples (list[:obj:`DetDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
            dn_meta (Dict[str, int]): The dictionary saves information about
                group collation, including 'num_denoising_queries' and
                'num_denoising_groups'. It will be used for split outputs of
                denoising and matching parts and loss calculation.

        Returns:
            dict: A dictionary of loss components.
""" batch_gt_instances = [] batch_img_metas = [] for data_sample in batch_data_samples: batch_img_metas.append(data_sample.metainfo) batch_gt_instances.append(data_sample.gt_instances) outs = self(hidden_states, references) loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, batch_gt_instances, batch_img_metas, dn_meta) losses = self.loss_by_feat(*loss_inputs) return losses def loss_by_feat( self, all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, enc_cls_scores: Tensor, enc_bbox_preds: Tensor, batch_gt_instances: InstanceList, batch_img_metas: List[dict], dn_meta: Dict[str, int], batch_gt_instances_ignore: OptInstanceList = None ) -> Dict[str, Tensor]: """Loss function. Args: all_layers_cls_scores (Tensor): Classification scores of all decoder layers, has shape (num_decoder_layers, bs, num_queries_total, cls_out_channels), where `num_queries_total` is the sum of `num_denoising_queries` and `num_matching_queries`. all_layers_bbox_preds (Tensor): Regression outputs of all decoder layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (num_decoder_layers, bs, num_queries_total, 4). enc_cls_scores (Tensor): The score of each point on encode feature map, has shape (bs, num_feat_points, cls_out_channels). enc_bbox_preds (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). batch_gt_instances (list[:obj:`InstanceData`]): Batch of gt_instance. It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): Batch of gt_instances_ignore. It includes ``bboxes`` attribute data that is ignored during training and testing. Defaults to None. Returns: dict[str, Tensor]: A dictionary of loss components. """ # extract denoising and matching part of outputs (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ self.split_outputs( all_layers_cls_scores, all_layers_bbox_preds, dn_meta) loss_dict = super().loss_by_feat( all_layers_matching_cls_scores, all_layers_matching_bbox_preds, batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) # NOTE DETRHead.loss_by_feat but not DeformableDETRHead.loss_by_feat # is called, because the encoder loss calculations are different # between DINO and DeformableDETR. # loss of proposal generated from encode feature map. if enc_cls_scores is not None: # NOTE The enc_loss calculation of the DINO is # different from that of Deformable DETR. 
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_by_feat_single(
                    enc_cls_scores, enc_bbox_preds,
                    batch_gt_instances=batch_gt_instances,
                    batch_img_metas=batch_img_metas)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        if all_layers_denoising_cls_scores is not None:
            # calculate denoising loss from all decoder layers
            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
                all_layers_denoising_cls_scores,
                all_layers_denoising_bbox_preds,
                batch_gt_instances=batch_gt_instances,
                batch_img_metas=batch_img_metas,
                dn_meta=dn_meta)
            # collate denoising loss
            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
            for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \
                    enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1],
                                  dn_losses_iou[:-1])):
                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
        return loss_dict

    def loss_dn(self, all_layers_denoising_cls_scores: Tensor,
                all_layers_denoising_bbox_preds: Tensor,
                batch_gt_instances: InstanceList,
                batch_img_metas: List[dict],
                dn_meta: Dict[str, int]) -> Tuple[List[Tensor]]:
        """Calculate denoising loss.

        Args:
            all_layers_denoising_cls_scores (Tensor): Classification scores of
                all decoder layers in denoising part, has shape (
                num_decoder_layers, bs, num_denoising_queries,
                cls_out_channels).
            all_layers_denoising_bbox_preds (Tensor): Regression outputs of
                all decoder layers in denoising part. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and has shape
                (num_decoder_layers, bs, num_denoising_queries, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            dn_meta (Dict[str, int]): The dictionary saves information about
                group collation, including 'num_denoising_queries' and
                'num_denoising_groups'. It will be used for split outputs of
                denoising and matching parts and loss calculation.

        Returns:
            Tuple[List[Tensor]]: The loss_dn_cls, loss_dn_bbox, and
            loss_dn_iou of each decoder layers.
        """
        return multi_apply(
            self._loss_dn_single,
            all_layers_denoising_cls_scores,
            all_layers_denoising_bbox_preds,
            batch_gt_instances=batch_gt_instances,
            batch_img_metas=batch_img_metas,
            dn_meta=dn_meta)

    def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
                        batch_gt_instances: InstanceList,
                        batch_img_metas: List[dict],
                        dn_meta: Dict[str, int]) -> Tuple[Tensor]:
        """Denoising loss for outputs from a single decoder layer.

        Args:
            dn_cls_scores (Tensor): Classification scores of a single decoder
                layer in denoising part, has shape (bs,
                num_denoising_queries, cls_out_channels).
            dn_bbox_preds (Tensor): Regression outputs of a single decoder
                layer in denoising part. Each is a 4D-tensor with normalized
                coordinate format (cx, cy, w, h) and has shape
                (bs, num_denoising_queries, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            dn_meta (Dict[str, int]): The dictionary saves information about
                group collation, including 'num_denoising_queries' and
                'num_denoising_groups'.
                It will be used for split outputs of denoising and matching
                parts and loss calculation.

        Returns:
            Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and
            `loss_iou`.
        """
        cls_reg_targets = self.get_dn_targets(batch_gt_instances,
                                              batch_img_metas, dn_meta)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = \
            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        if len(cls_scores) > 0:
            loss_cls = self.loss_cls(
                cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        else:
            loss_cls = torch.zeros(
                1, dtype=cls_scores.dtype, device=cls_scores.device)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds):
            img_h, img_w = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = dn_bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_dn_targets(self, batch_gt_instances: InstanceList,
                       batch_img_metas: List[dict],
                       dn_meta: Dict[str, int]) -> tuple:
        """Get targets in denoising part for a batch of images.

        Args:
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            dn_meta (Dict[str, int]): The dictionary saves information about
                group collation, including 'num_denoising_queries' and
                'num_denoising_groups'. It will be used for split outputs of
                denoising and matching parts and loss calculation.

        Returns:
            tuple: a tuple containing the following targets.

            - labels_list (list[Tensor]): Labels for all images.
            - label_weights_list (list[Tensor]): Label weights for all images.
            - bbox_targets_list (list[Tensor]): BBox targets for all images.
            - bbox_weights_list (list[Tensor]): BBox weights for all images.
            - num_total_pos (int): Number of positive samples in all images.
            - num_total_neg (int): Number of negative samples in all images.
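
        Note:
            The denoising queries are assumed to be laid out as
            ``num_denoising_groups`` consecutive groups of equal size; inside
            each group the first half of the slots hold the positive (noised
            ground-truth) queries and the second half hold the negatives,
            which is how ``pos_inds`` and ``neg_inds`` are derived in
            ``_get_dn_targets_single``.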
""" (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( self._get_dn_targets_single, batch_gt_instances, batch_img_metas, dn_meta=dn_meta) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) def _get_dn_targets_single(self, gt_instances: InstanceData, img_meta: dict, dn_meta: Dict[str, int]) -> tuple: """Get targets in denoising part for one image. Args: gt_instances (:obj:`InstanceData`): Ground truth of instance annotations. It should includes ``bboxes`` and ``labels`` attributes. img_meta (dict): Meta information for one image. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. Returns: tuple[Tensor]: a tuple containing the following for one image. - labels (Tensor): Labels of each image. - label_weights (Tensor]): Label weights of each image. - bbox_targets (Tensor): BBox targets of each image. - bbox_weights (Tensor): BBox weights of each image. - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. """ gt_bboxes = gt_instances.bboxes gt_labels = gt_instances.labels num_groups = dn_meta['num_denoising_groups'] num_denoising_queries = dn_meta['num_denoising_queries'] num_queries_each_group = int(num_denoising_queries / num_groups) device = gt_bboxes.device if len(gt_labels) > 0: t = torch.arange(len(gt_labels), dtype=torch.long, device=device) t = t.unsqueeze(0).repeat(num_groups, 1) pos_assigned_gt_inds = t.flatten() pos_inds = torch.arange( num_groups, dtype=torch.long, device=device) pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t pos_inds = pos_inds.flatten() else: pos_inds = pos_assigned_gt_inds = \ gt_bboxes.new_tensor([], dtype=torch.long) neg_inds = pos_inds + num_queries_each_group // 2 # label targets labels = gt_bboxes.new_full((num_denoising_queries, ), self.num_classes, dtype=torch.long) labels[pos_inds] = gt_labels[pos_assigned_gt_inds] label_weights = gt_bboxes.new_ones(num_denoising_queries) # bbox targets bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) bbox_weights[pos_inds] = 1.0 img_h, img_w = img_meta['img_shape'] # DETR regress the relative position of boxes (cxcywh) in the image. # Thus the learning target should be normalized by the image size, also # the box format should be converted from defaultly x1y1x2y2 to cxcywh. factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) gt_bboxes_normalized = gt_bboxes / factor gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds) @staticmethod def split_outputs(all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, dn_meta: Dict[str, int]) -> Tuple[Tensor]: """Split outputs of the denoising part and the matching part. 

        For the total outputs of `num_queries_total` length, the former
        `num_denoising_queries` outputs are from denoising queries, and
        the rest `num_matching_queries` ones are from matching queries,
        where `num_queries_total` is the sum of `num_denoising_queries` and
        `num_matching_queries`.

        Args:
            all_layers_cls_scores (Tensor): Classification scores of all
                decoder layers, has shape (num_decoder_layers, bs,
                num_queries_total, cls_out_channels).
            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
                layers. Each is a 4D-tensor with normalized coordinate format
                (cx, cy, w, h) and has shape (num_decoder_layers, bs,
                num_queries_total, 4).
            dn_meta (Dict[str, int]): The dictionary saves information about
                group collation, including 'num_denoising_queries' and
                'num_denoising_groups'.

        Returns:
            Tuple[Tensor]: a tuple containing the following outputs.

            - all_layers_matching_cls_scores (Tensor): Classification scores
              of all decoder layers in matching part, has shape
              (num_decoder_layers, bs, num_matching_queries,
              cls_out_channels).
            - all_layers_matching_bbox_preds (Tensor): Regression outputs of
              all decoder layers in matching part. Each is a 4D-tensor with
              normalized coordinate format (cx, cy, w, h) and has shape
              (num_decoder_layers, bs, num_matching_queries, 4).
            - all_layers_denoising_cls_scores (Tensor): Classification scores
              of all decoder layers in denoising part, has shape
              (num_decoder_layers, bs, num_denoising_queries,
              cls_out_channels).
            - all_layers_denoising_bbox_preds (Tensor): Regression outputs of
              all decoder layers in denoising part. Each is a 4D-tensor with
              normalized coordinate format (cx, cy, w, h) and has shape
              (num_decoder_layers, bs, num_denoising_queries, 4).
        """
        if dn_meta is not None:
            num_denoising_queries = dn_meta['num_denoising_queries']
            all_layers_denoising_cls_scores = \
                all_layers_cls_scores[:, :, :num_denoising_queries, :]
            all_layers_denoising_bbox_preds = \
                all_layers_bbox_preds[:, :, :num_denoising_queries, :]
            all_layers_matching_cls_scores = \
                all_layers_cls_scores[:, :, num_denoising_queries:, :]
            all_layers_matching_bbox_preds = \
                all_layers_bbox_preds[:, :, num_denoising_queries:, :]
        else:
            all_layers_denoising_cls_scores = None
            all_layers_denoising_bbox_preds = None
            all_layers_matching_cls_scores = all_layers_cls_scores
            all_layers_matching_bbox_preds = all_layers_bbox_preds
        return (all_layers_matching_cls_scores,
                all_layers_matching_bbox_preds,
                all_layers_denoising_cls_scores,
                all_layers_denoising_bbox_preds)

    def predict(self,
                hidden_states: Tensor,
                references: List[Tensor],
                batch_data_samples: SampleList,
                rescale: bool = True) -> InstanceList:
        """Perform forward propagation of the detection head and predict
        detection results on the queries of the upstream network.

        Args:
            hidden_states (Tensor): Hidden states output from each decoder
                layer, has shape (num_decoder_layers, num_queries, bs, dim).
            references (list[Tensor]): List of the reference from the decoder.
                The first reference is the `init_reference` (initial) and the
                other num_decoder_layers(6) references are `inter_references`
                (intermediate). The `init_reference` has shape (bs,
                num_queries, 4) when `as_two_stage` of the detector is `True`,
                otherwise (bs, num_queries, 2). Each `inter_reference` has
                shape (bs, num_queries, 4) when `with_box_refine` of the
                detector is `True`, otherwise (bs, num_queries, 2). The
                coordinates are arranged as (cx, cy) when the last dimension
                is 2, and (cx, cy, w, h) when it is 4.
            batch_data_samples (list[:obj:`DetDataSample`]): The Data Samples.
                It usually includes information such as `gt_instance`,
                `gt_panoptic_seg` and `gt_sem_seg`.
            rescale (bool, optional): If `True`, return boxes in original
                image space. Defaults to `True`.

        Returns:
            list[obj:`InstanceData`]: Detection results of each image
            after the post process.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]

        outs = self(hidden_states, references)

        predictions = self.predict_by_feat(
            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
        return predictions

    def predict_by_feat(self,
                        all_layers_cls_scores: Tensor,
                        all_layers_bbox_preds: Tensor,
                        batch_img_metas: List[Dict],
                        rescale: bool = False) -> InstanceList:
        """Transform a batch of output features extracted from the head into
        bbox results.

        Args:
            all_layers_cls_scores (Tensor): Classification scores of all
                decoder layers, has shape (num_decoder_layers, bs,
                num_queries, cls_out_channels).
            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
                layers. Each is a 4D-tensor with normalized coordinate format
                (cx, cy, w, h) and shape (num_decoder_layers, bs,
                num_queries, 4) with the last dimension arranged as
                (cx, cy, w, h).
            batch_img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If `True`, return boxes in original
                image space. Default `False`.

        Returns:
            list[obj:`InstanceData`]: Detection results of each image
            after the post process.
        """
        cls_scores = all_layers_cls_scores[-1]
        bbox_preds = all_layers_bbox_preds[-1]

        result_list = []
        for img_id in range(len(batch_img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_meta = batch_img_metas[img_id]
            results = self._predict_by_feat_single(cls_score, bbox_pred,
                                                   img_meta, rescale)
            result_list.append(results)
        return result_list

    def _predict_by_feat_single(self,
                                cls_score: Tensor,
                                bbox_pred: Tensor,
                                img_meta: dict,
                                rescale: bool = True) -> InstanceData:
        """Transform outputs from the last decoder layer into bbox predictions
        for each image.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_queries, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
                for each image, with coordinate format (cx, cy, w, h) and
                shape [num_queries, 4].
            img_meta (dict): Image meta info.
            rescale (bool): If True, return boxes in original image
                space. Default True.

        Returns:
            :obj:`InstanceData`: Detection results of each image
            after the post process.
            Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arrange as (x1, y1, x2, y2).
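
        Note:
            A rough sketch of the sigmoid top-k decoding used below, with
            hypothetical numbers (not taken from the original code): the
            scores are flattened over ``num_queries * num_classes``, so with
            ``num_classes = 80`` a flattened index of ``163`` corresponds to
            query ``163 // 80 = 2`` and label ``163 % 80 = 3``.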
""" assert len(cls_score) == len(bbox_pred) # num_queries max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) img_shape = img_meta['img_shape'] # exclude background if self.loss_cls.use_sigmoid: cls_score = cls_score.sigmoid() scores, indexes = cls_score.view(-1).topk(max_per_img) det_labels = indexes % self.num_classes bbox_index = indexes // self.num_classes bbox_pred = bbox_pred[bbox_index] else: scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) scores, bbox_index = scores.topk(max_per_img) bbox_pred = bbox_pred[bbox_index] det_labels = det_labels[bbox_index] det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) #lzx if self.use_nms: iou_threshold= 0.01 offset = 0 score_threshold = 0.0 # torch.mean(cls_score.view(-1))+3*torch.std(cls_score.view(-1)) max_num = 300 dets, inds = nms(det_bboxes, scores, iou_threshold, offset, score_threshold, max_num) det_bboxes = dets[:,:-1] scores = dets[:,-1] det_labels =det_labels[inds] # add by lzx if self.pre_bboxes_round: det_bboxes = adjust_bbox_to_pixel(det_bboxes) if rescale: # assert img_meta.get('scale_factor') is not None # det_bboxes /= det_bboxes.new_tensor( # img_meta['scale_factor']).repeat((1, 2)) # rw by lzx if img_meta.get('scale_factor') is not None: det_bboxes /= det_bboxes.new_tensor( img_meta['scale_factor']).repeat((1, 2)) results = InstanceData() results.bboxes = det_bboxes results.scores = scores results.labels = det_labels return results def loss_group(self, hidden_states: Tensor, references: List[Tensor], enc_outputs_class: Tensor, enc_outputs_coord: Tensor, batch_data_samples: SampleList, dn_meta: Dict[str, int], # match_group_size: Tuple[int, int] = (2, 2), each_match_num_queries: int = 200, ) -> dict: """Perform forward propagation and loss calculation of the detection head on the queries of the upstream network. Args: hidden_states (Tensor): Hidden states output from each decoder layer, has shape (num_decoder_layers, bs, num_queries_total, dim), where `num_queries_total` is the sum of `num_denoising_queries` and `num_matching_queries` when `self.training` is `True`, else `num_matching_queries`. references (list[Tensor]): List of the reference from the decoder. The first reference is the `init_reference` (initial) and the other num_decoder_layers(6) references are `inter_references` (intermediate). The `init_reference` has shape (bs, num_queries_total, 4) and each `inter_reference` has shape (bs, num_queries, 4) with the last dimension arranged as (cx, cy, w, h). enc_outputs_class (Tensor): The score of each point on encode feature map, has shape (bs, num_feat_points, cls_out_channels). enc_outputs_coord (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). batch_data_samples (list[:obj:`DetDataSample`]): The Data Samples. It usually includes information such as `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. Returns: dict: A dictionary of loss components. 
""" batch_gt_instances = [] batch_img_metas = [] for data_sample in batch_data_samples: batch_img_metas.append(data_sample.metainfo) batch_gt_instances.append(data_sample.gt_instances) outs = self(hidden_states, references) loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, batch_gt_instances, batch_img_metas, dn_meta,each_match_num_queries) losses = self.loss_by_feat_group(*loss_inputs) return losses def loss_by_feat_group( self, all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, enc_cls_scores: Tensor, enc_bbox_preds: Tensor, batch_gt_instances: InstanceList, batch_img_metas: List[dict], dn_meta: Dict[str, int], # match_group_size: Tuple[int, int] = (2, 2), each_match_num_queries: int = 200, batch_gt_instances_ignore: OptInstanceList = None, ) -> Dict[str, Tensor]: """Loss function. Args: all_layers_cls_scores (Tensor): Classification scores of all decoder layers, has shape (num_decoder_layers, bs, num_queries_total, cls_out_channels), where `num_queries_total` is the sum of `num_denoising_queries` and `num_matching_queries`. all_layers_bbox_preds (Tensor): Regression outputs of all decoder layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (num_decoder_layers, bs, num_queries_total, 4). enc_cls_scores (Tensor): The score of each point on encode feature map, has shape (bs, num_feat_points, cls_out_channels). enc_bbox_preds (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). batch_gt_instances (list[:obj:`InstanceData`]): Batch of gt_instance. It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): Batch of gt_instances_ignore. It includes ``bboxes`` attribute data that is ignored during training and testing. Defaults to None. Returns: dict[str, Tensor]: A dictionary of loss components. 
""" # extract denoising and matching part of outputs (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ self.split_outputs( all_layers_cls_scores, all_layers_bbox_preds, dn_meta) match_group_num = all_layers_matching_cls_scores.shape[2]//each_match_num_queries loss_dict = dict() for id_group in range(match_group_num): all_layers_matching_cls_scores_one_group = all_layers_matching_cls_scores[:, :, id_group * each_match_num_queries:(id_group + 1) * each_match_num_queries, :] all_layers_matching_bbox_preds_one_group = all_layers_matching_bbox_preds[:, :, id_group * each_match_num_queries:(id_group + 1) * each_match_num_queries, :] # 同时计算每一解码层的loss losses_cls, losses_bbox, losses_iou = multi_apply( self.loss_by_feat_single, all_layers_matching_cls_scores_one_group, all_layers_matching_bbox_preds_one_group, batch_gt_instances=batch_gt_instances, batch_img_metas=batch_img_metas) # loss from the last decoder layer loss_dict[f'g{id_group}.loss_cls'] = losses_cls[-1] loss_dict[f'g{id_group}.loss_bbox'] = losses_bbox[-1] loss_dict[f'g{id_group}.loss_iou'] = losses_iou[-1] # loss from other decoder layers num_dec_layer = 0 for loss_cls_i, loss_bbox_i, loss_iou_i in \ zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): loss_dict[f'g{id_group}d{num_dec_layer}.loss_cls'] = loss_cls_i loss_dict[f'g{id_group}d{num_dec_layer}.loss_bbox'] = loss_bbox_i loss_dict[f'g{id_group}d{num_dec_layer}.loss_iou'] = loss_iou_i num_dec_layer += 1 # loss_dict_one_groug = super().loss_by_feat(all_layers_matching_cls_scores_one_group, all_layers_matching_bbox_preds_one_group, # batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) enc_cls_scores_one_group = enc_cls_scores[:, id_group * each_match_num_queries:(id_group + 1) * each_match_num_queries, :] enc_bbox_preds_one_group = enc_bbox_preds[:, id_group * each_match_num_queries:(id_group + 1) * each_match_num_queries, :] if enc_cls_scores is not None: # NOTE The enc_loss calculation of the DINO is # different from that of Deformable DETR. enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ self.loss_by_feat_single(enc_cls_scores_one_group, enc_bbox_preds_one_group, batch_gt_instances=batch_gt_instances, batch_img_metas=batch_img_metas) loss_dict[f'g{id_group}.enc_loss_cls'] = enc_loss_cls loss_dict[f'g{id_group}.enc_loss_bbox'] = enc_losses_bbox loss_dict[f'g{id_group}.enc_loss_iou'] = enc_losses_iou # loss_dict = super().loss_by_feat( # all_layers_matching_cls_scores, all_layers_matching_bbox_preds, # batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) # # NOTE DETRHead.loss_by_feat but not DeformableDETRHead.loss_by_feat # # is called, because the encoder loss calculations are different # # between DINO and DeformableDETR. # # loss of proposal generated from encode feature map. # if enc_cls_scores is not None: # # NOTE The enc_loss calculation of the DINO is # # different from that of Deformable DETR. 
        if all_layers_denoising_cls_scores is not None:
            # calculate denoising loss from all decoder layers
            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
                all_layers_denoising_cls_scores,
                all_layers_denoising_bbox_preds,
                batch_gt_instances=batch_gt_instances,
                batch_img_metas=batch_img_metas,
                dn_meta=dn_meta)
            # collate denoising loss
            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
            for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \
                    enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1],
                                  dn_losses_iou[:-1])):
                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
        return loss_dict

    def loss_ddn(self, hidden_states: Tensor, references: List[Tensor],
                 enc_outputs_class: Tensor, enc_outputs_coord: Tensor,
                 batch_data_samples: SampleList,
                 dn_meta: Dict[str, int]) -> dict:
        """Perform forward propagation and loss calculation of the detection
        head on the queries of the upstream network.

        Besides the loss dictionary, this variant also returns the mean
        absolute offsets between positive box predictions and their targets.

        Args:
            hidden_states (Tensor): Hidden states output from each decoder
                layer, has shape (num_decoder_layers, bs, num_queries_total,
                dim), where `num_queries_total` is the sum of
                `num_denoising_queries` and `num_matching_queries` when
                `self.training` is `True`, else `num_matching_queries`.
            references (list[Tensor]): List of the reference from the decoder.
                The first reference is the `init_reference` (initial) and the
                other num_decoder_layers(6) references are `inter_references`
                (intermediate). The `init_reference` has shape (bs,
                num_queries_total, 4) and each `inter_reference` has shape
                (bs, num_queries, 4) with the last dimension arranged as
                (cx, cy, w, h).
            enc_outputs_class (Tensor): The score of each point on encode
                feature map, has shape (bs, num_feat_points,
                cls_out_channels).
            enc_outputs_coord (Tensor): The proposal generate from the encode
                feature map, has shape (bs, num_feat_points, 4) with the last
                dimension arranged as (cx, cy, w, h).
            batch_data_samples (list[:obj:`DetDataSample`]): The Data
                Samples. It usually includes information such as
                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
            dn_meta (Dict[str, int]): The dictionary saves information about
                group collation, including 'num_denoising_queries' and
                'num_denoising_groups'. It will be used for split outputs of
                denoising and matching parts and loss calculation.

        Returns:
            tuple: A dictionary of loss components and a list with the mean
            absolute offsets between positive box predictions and their
            targets.
""" batch_gt_instances = [] batch_img_metas = [] for data_sample in batch_data_samples: batch_img_metas.append(data_sample.metainfo) batch_gt_instances.append(data_sample.gt_instances) outs = self(hidden_states, references) loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, batch_gt_instances, batch_img_metas, dn_meta) losses,pos_bbox_offsets = self.loss_ddn_by_feat(*loss_inputs) return losses,pos_bbox_offsets def loss_ddn_by_feat( self, all_layers_cls_scores: Tensor, all_layers_bbox_preds: Tensor, enc_cls_scores: Tensor, enc_bbox_preds: Tensor, batch_gt_instances: InstanceList, batch_img_metas: List[dict], dn_meta: Dict[str, int], # match_group_size: Tuple[int, int] = (2, 2), batch_gt_instances_ignore: OptInstanceList = None, ) -> Tuple[Dict[str, Tensor], List]: """Loss function. Args: all_layers_cls_scores (Tensor): Classification scores of all decoder layers, has shape (num_decoder_layers, bs, num_queries_total, cls_out_channels), where `num_queries_total` is the sum of `num_denoising_queries` and `num_matching_queries`. all_layers_bbox_preds (Tensor): Regression outputs of all decoder layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and has shape (num_decoder_layers, bs, num_queries_total, 4). enc_cls_scores (Tensor): The score of each point on encode feature map, has shape (bs, num_feat_points, cls_out_channels). enc_bbox_preds (Tensor): The proposal generate from the encode feature map, has shape (bs, num_feat_points, 4) with the last dimension arranged as (cx, cy, w, h). batch_gt_instances (list[:obj:`InstanceData`]): Batch of gt_instance. It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. dn_meta (Dict[str, int]): The dictionary saves information about group collation, including 'num_denoising_queries' and 'num_denoising_groups'. It will be used for split outputs of denoising and matching parts and loss calculation. batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): Batch of gt_instances_ignore. It includes ``bboxes`` attribute data that is ignored during training and testing. Defaults to None. Returns: dict[str, Tensor]: A dictionary of loss components. """ # extract denoising and matching part of outputs (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ self.split_outputs( all_layers_cls_scores, all_layers_bbox_preds, dn_meta) loss_dict = dict() assert batch_gt_instances_ignore is None, \ f'{self.__class__.__name__} only supports ' \ 'for batch_gt_instances_ignore setting to None.' 
        # compute the loss of every decoder layer at once
        losses_cls, losses_bbox, losses_iou, pos_bbox_offsets = multi_apply(
            self.loss_ddn_by_feat_single,
            all_layers_matching_cls_scores,
            all_layers_matching_bbox_preds,
            batch_gt_instances=batch_gt_instances,
            batch_img_metas=batch_img_metas)

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in \
                zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        # NOTE DETRHead.loss_by_feat but not DeformableDETRHead.loss_by_feat
        # is called, because the encoder loss calculations are different
        # between DINO and DeformableDETR.

        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # NOTE The enc_loss calculation of the DINO is
            # different from that of Deformable DETR.
            enc_loss_cls, enc_losses_bbox, enc_losses_iou, \
                pos_bbox_offsets = self.loss_ddn_by_feat_single(
                    enc_cls_scores, enc_bbox_preds,
                    batch_gt_instances=batch_gt_instances,
                    batch_img_metas=batch_img_metas)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        if all_layers_denoising_cls_scores is not None:
            # calculate denoising loss from all decoder layers
            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
                all_layers_denoising_cls_scores,
                all_layers_denoising_bbox_preds,
                batch_gt_instances=batch_gt_instances,
                batch_img_metas=batch_img_metas,
                dn_meta=dn_meta)
            # collate denoising loss
            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
            for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \
                    enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1],
                                  dn_losses_iou[:-1])):
                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
        return loss_dict, pos_bbox_offsets

    def loss_ddn_by_feat_single(
            self, cls_scores: Tensor, bbox_preds: Tensor,
            batch_gt_instances: InstanceList,
            batch_img_metas: List[dict]) -> Tuple[Tensor, List]:
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images, has shape (bs, num_queries, cls_out_channels).
            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (cx, cy, w, h) and
                shape (bs, num_queries, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.

        Returns:
            Tuple[Tensor, List]: A tuple including `loss_cls`, `loss_box`,
            `loss_iou` and `pos_bbox_offsets`.
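
        Note:
            ``pos_bbox_offsets`` collapses the per-coordinate mean absolute
            errors of the positive predictions into two values,
            ``[(|dcx| + |dcy|) / 2, (|dw| + |dh|) / 2]``, computed in
            normalized (cx, cy, w, h) coordinates. It is detached and does
            not contribute to the loss value itself.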
""" num_imgs = cls_scores.size(0) cls_scores_list = [cls_scores[i] for i in range(num_imgs)] bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets_ddn(cls_scores_list, bbox_preds_list, batch_gt_instances, batch_img_metas) (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets labels = torch.cat(labels_list, 0) label_weights = torch.cat(label_weights_list, 0) bbox_targets = torch.cat(bbox_targets_list, 0) bbox_weights = torch.cat(bbox_weights_list, 0) # classification loss cls_scores = cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # construct factors used for rescale bboxes factors = [] for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): img_h, img_w, = img_meta['img_shape'] factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0).repeat( bbox_pred.size(0), 1) factors.append(factor) factors = torch.cat(factors, 0) # DETR regress the relative position of boxes (cxcywh) in the image, # thus the learning target is normalized by the image size. So here # we need to re-scale them for calculating IoU loss bbox_preds = bbox_preds.reshape(-1, 4) bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors # lzx # 检查bbox_targets中4个值是否全部为0 is_target = torch.any(bbox_targets != 0, dim=1) # 获取目标的索引 target_indices = torch.nonzero(is_target).squeeze() bbox_targets_only_pos = bbox_targets[target_indices] bbox_preds_only_pos = bbox_preds[target_indices] # factors_only_pos = factors[target_indices] pos_bbox_offset = torch.mean(torch.abs(bbox_targets_only_pos-bbox_preds_only_pos),dim=0).detach().cpu().numpy() pos_bbox_offsets = [(pos_bbox_offset[0]+pos_bbox_offset[1])/2,(pos_bbox_offset[2]+pos_bbox_offset[3])/2] # regression IoU loss, defaultly GIoU loss loss_iou = self.loss_iou( bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) # regression L1 loss loss_bbox = self.loss_bbox( bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) return loss_cls, loss_bbox, loss_iou, pos_bbox_offsets def get_targets_ddn(self, cls_scores_list: List[Tensor], bbox_preds_list: List[Tensor], batch_gt_instances: InstanceList, batch_img_metas: List[dict]) -> tuple: """Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: cls_scores_list (list[Tensor]): Box score logits from a single decoder layer for each image, has shape [num_queries, cls_out_channels]. bbox_preds_list (list[Tensor]): Sigmoid outputs from a single decoder layer for each image, with normalized coordinate (cx, cy, w, h) and shape [num_queries, 4]. batch_gt_instances (list[:obj:`InstanceData`]): Batch of gt_instance. It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. 

        Returns:
            tuple: a tuple containing the following targets.

            - labels_list (list[Tensor]): Labels for all images.
            - label_weights_list (list[Tensor]): Label weights for all images.
            - bbox_targets_list (list[Tensor]): BBox targets for all images.
            - bbox_weights_list (list[Tensor]): BBox weights for all images.
            - num_total_pos (int): Number of positive samples in all images.
            - num_total_neg (int): Number of negative samples in all images.
        """
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list,
         neg_inds_list) = multi_apply(self._get_targets_single_ddn,
                                      cls_scores_list, bbox_preds_list,
                                      batch_gt_instances, batch_img_metas)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_targets_single_ddn(self, cls_score: Tensor, bbox_pred: Tensor,
                                gt_instances: InstanceData,
                                img_meta: dict) -> tuple:
        """Compute regression and classification targets for one image.

        Outputs from a single decoder layer of a single feature level are
        used.

        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_queries, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
                for one image, with normalized coordinate (cx, cy, w, h) and
                shape [num_queries, 4].
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It should includes ``bboxes`` and ``labels``
                attributes.
            img_meta (dict): Meta information for one image.

        Returns:
            tuple[Tensor]: a tuple containing the following for one image.

            - labels (Tensor): Labels of each image.
            - label_weights (Tensor): Label weights of each image.
            - bbox_targets (Tensor): BBox targets of each image.
            - bbox_weights (Tensor): BBox weights of each image.
            - pos_inds (Tensor): Sampled positive indices for each image.
            - neg_inds (Tensor): Sampled negative indices for each image.
""" img_h, img_w = img_meta['img_shape'] factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) num_bboxes = bbox_pred.size(0) # convert bbox_pred from xywh, normalized to xyxy, unnormalized bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) bbox_pred = bbox_pred * factor pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) # assigner and sampler assign_result = self.assigner.assign( pred_instances=pred_instances, gt_instances=gt_instances, img_meta=img_meta) # from mmdet.models.task_modules.assigners import MaxIoUAssigner # assigner1 = MaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.3,match_low_quality=True,ignore_iof_thr=-1) # pred_instances1 = InstanceData() # pred_instances1.priors =pred_instances.bboxes # assign_result = assigner1.assign(pred_instances=pred_instances1, gt_instances=gt_instances, img_meta=img_meta) gt_bboxes = gt_instances.bboxes gt_labels = gt_instances.labels pos_inds = torch.nonzero( assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() neg_inds = torch.nonzero( assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] # label targets labels = gt_bboxes.new_full((num_bboxes, ), self.num_classes, dtype=torch.long) labels[pos_inds] = gt_labels[pos_assigned_gt_inds] label_weights = gt_bboxes.new_ones(num_bboxes) # bbox targets bbox_targets = torch.zeros_like(bbox_pred) bbox_weights = torch.zeros_like(bbox_pred) bbox_weights[pos_inds] = 1.0 # DETR regress the relative position of boxes (cxcywh) in the image. # Thus the learning target should be normalized by the image size, also # the box format should be converted from defaultly x1y1x2y2 to cxcywh. pos_gt_bboxes_normalized = pos_gt_bboxes / factor pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) bbox_targets[pos_inds] = pos_gt_bboxes_targets return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds)