# Copyright (c) OpenMMLab. All rights reserved. from typing import Dict, List, Tuple import torch from mmengine.structures import InstanceData from torch import Tensor import torch.nn.functional as F from mmdet.registry import MODELS from mmdet.structures import SampleList from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh from mmdet.utils import InstanceList, OptInstanceList, reduce_mean from ..utils import multi_apply from .deformable_detr_head import DeformableDETRHead from .dino_head import DINOHead # def bbox_cxcywh_to_xyxy_1pixel(bbox: Tensor, factors: Tensor) -> Tensor: # """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). # # Args: # bbox (Tensor): Shape (n, 4) for bboxes. # # Returns: # Tensor: Converted bboxes. # # """ # bbox1 = bbox*factors # cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1) # # bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)] # return torch.cat(bbox_new, dim=-1) def adjust_bbox_to_pixel(bboxes: Tensor): # 向下取整得到目标的左上角坐标 adjusted_bboxes = torch.floor(bboxes) # 向上取整得到目标的右下角坐标 adjusted_bboxes[:, 2:] = torch.ceil(bboxes[:, 2:]) return adjusted_bboxes def adjust_bbox_to_pixelV2(bboxes: Tensor): # 向下取整得到目标的左上角坐标 adjusted_bboxes = torch.floor(bboxes) # 向上取整得到目标的右下角坐标 adjusted_bboxes[:, 2:] = torch.ceil(bboxes[:, 2:]) return adjusted_bboxes @MODELS.register_module() class DINOSTHead(DINOHead): def __init__(self, *args, round_coord: bool = False, **kwargs) -> None: self.round_coord = round_coord super().__init__(*args, **kwargs) def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor, batch_gt_instances: InstanceList, batch_img_metas: List[dict]) -> Tuple[Tensor]: """Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images, has shape (bs, num_queries, cls_out_channels). bbox_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (cx, cy, w, h) and shape (bs, num_queries, 4). batch_gt_instances (list[:obj:`InstanceData`]): Batch of gt_instance. It usually includes ``bboxes`` and ``labels`` attributes. batch_img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and `loss_iou`. """ num_imgs = cls_scores.size(0) cls_scores_list = [cls_scores[i] for i in range(num_imgs)] bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, batch_gt_instances, batch_img_metas) (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets labels = torch.cat(labels_list, 0) label_weights = torch.cat(label_weights_list, 0) bbox_targets = torch.cat(bbox_targets_list, 0) bbox_weights = torch.cat(bbox_weights_list, 0) # classification loss cls_scores = cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # construct factors used for rescale bboxes factors = [] for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): img_h, img_w, = img_meta['img_shape'] factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0).repeat( bbox_pred.size(0), 1) factors.append(factor) factors = torch.cat(factors, 0) # DETR regress the relative position of boxes (cxcywh) in the image, # thus the learning target is normalized by the image size. So here # we need to re-scale them for calculating IoU loss bbox_preds = bbox_preds.reshape(-1, 4) bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors bboxes=adjust_bbox_to_pixel(bboxes) bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors # # regression IoU loss, defaultly GIoU loss loss_iou = self.loss_iou( bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) # regression L1 loss loss_bbox = self.loss_bbox( bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) return loss_cls, loss_bbox, loss_iou def _predict_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, img_meta: dict, rescale: bool = True) -> InstanceData: """Transform outputs from the last decoder layer into bbox predictions for each image. Args: cls_score (Tensor): Box score logits from the last decoder layer for each image. Shape [num_queries, cls_out_channels]. bbox_pred (Tensor): Sigmoid outputs from the last decoder layer for each image, with coordinate format (cx, cy, w, h) and shape [num_queries, 4]. img_meta (dict): Image meta info. rescale (bool): If True, return boxes in original image space. Default True. Returns: :obj:`InstanceData`: Detection results of each image after the post process. Each item usually contains following keys. - scores (Tensor): Classification scores, has a shape (num_instance, ) - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). - bboxes (Tensor): Has a shape (num_instances, 4), the last dimension 4 arrange as (x1, y1, x2, y2). """ assert len(cls_score) == len(bbox_pred) # num_queries max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) img_shape = img_meta['img_shape'] # exclude background if self.loss_cls.use_sigmoid: cls_score = cls_score.sigmoid() scores, indexes = cls_score.view(-1).topk(max_per_img) det_labels = indexes % self.num_classes bbox_index = indexes // self.num_classes bbox_pred = bbox_pred[bbox_index] else: scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) scores, bbox_index = scores.topk(max_per_img) bbox_pred = bbox_pred[bbox_index] det_labels = det_labels[bbox_index] det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) if rescale: # assert img_meta.get('scale_factor') is not None # det_bboxes /= det_bboxes.new_tensor( # img_meta['scale_factor']).repeat((1, 2)) # rw by lzx if img_meta.get('scale_factor') is not None: det_bboxes /= det_bboxes.new_tensor( img_meta['scale_factor']).repeat((1, 2)) results = InstanceData() results.bboxes = det_bboxes results.scores = scores results.labels = det_labels return results """ import matplotlib.pyplot as plt # 绘制直方图 bboxes_gt bboxes_gt_np = bboxes_gt.detach().cpu().numpy() result = bbox_preds.detach().cpu().numpy() result = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1]) result = result.detach().cpu().numpy() plt.hist(result, bins=50, edgecolor='black') plt.xlabel('Result') plt.ylabel('Frequency') plt.title('Histogram of Result') plt.show() """