Upload 45 files
- linea/configs/linea/include/dataset.py +10 -0
- linea/configs/linea/include/linea.py +62 -0
- linea/configs/linea/include/optimizer.py +9 -0
- linea/configs/linea/linea_hgnetv2_l.py +56 -0
- linea/configs/linea/linea_hgnetv2_m.py +63 -0
- linea/configs/linea/linea_hgnetv2_n.py +63 -0
- linea/configs/linea/linea_hgnetv2_s.py +64 -0
- linea/models/__init__.py +8 -0
- linea/models/__pycache__/__init__.cpython-311.pyc +0 -0
- linea/models/__pycache__/registry.cpython-311.pyc +0 -0
- linea/models/linea/__init__.py +11 -0
- linea/models/linea/__pycache__/__init__.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/attention_mechanism.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/criterion.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/decoder.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/dn_components.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/hgnetv2.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/hybrid_encoder_asymmetric_conv.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/linea.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/linea_utils.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/matcher.cpython-311.pyc +0 -0
- linea/models/linea/__pycache__/utils.cpython-311.pyc +0 -0
- linea/models/linea/attention_mechanism.py +593 -0
- linea/models/linea/criterion.py +517 -0
- linea/models/linea/decoder.py +551 -0
- linea/models/linea/dn_components.py +178 -0
- linea/models/linea/hgnetv2.py +595 -0
- linea/models/linea/hybrid_encoder.py +471 -0
- linea/models/linea/hybrid_encoder_asymmetric_conv.py +549 -0
- linea/models/linea/linea.py +156 -0
- linea/models/linea/linea_utils.py +165 -0
- linea/models/linea/matcher.py +180 -0
- linea/models/linea/new_dn_components.py +163 -0
- linea/models/linea/position_encoding.py +150 -0
- linea/models/linea/utils.py +139 -0
- linea/models/registry.py +58 -0
- linea/requirements.txt +9 -0
- linea/util/__init__.py +1 -0
- linea/util/__pycache__/__init__.cpython-311.pyc +0 -0
- linea/util/__pycache__/misc.cpython-311.pyc +0 -0
- linea/util/__pycache__/slconfig.cpython-311.pyc +0 -0
- linea/util/get_param_dicts.py +35 -0
- linea/util/misc.py +275 -0
- linea/util/profiler.py +21 -0
- linea/util/slconfig.py +440 -0
linea/configs/linea/include/dataset.py
ADDED
@@ -0,0 +1,10 @@
data_aug_scales = [(640, 640)]
data_aug_max_size = 1333
data_aug_scales2_resize = [400, 500, 600]
data_aug_scales2_crop = [384, 600]


data_aug_scale_overlap = None
batch_size_train = 8
batch_size_val = 64
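Note (not part of the upload): DETR-style data pipelines usually consume settings like these by drawing a random target scale per image and capping the longer side at data_aug_max_size. A minimal sketch under that assumption; the function below is illustrative and not a helper from this repo:

import random
import torchvision.transforms.functional as TF

def random_resize(img, scales=((640, 640),), max_size=1333):
    # pick one target (h, w) from the configured scale list
    h, w = random.choice(scales)
    # cap the longer side at max_size while keeping the requested aspect ratio
    if max(h, w) > max_size:
        ratio = max_size / max(h, w)
        h, w = int(h * ratio), int(w * ratio)
    return TF.resize(img, [h, w])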
linea/configs/linea/include/linea.py
ADDED
@@ -0,0 +1,62 @@
# model
modelname = 'LINEA'
eval_spatial_size = (640, 640)
eval_idx = 5  # 6 decoder layers
num_classes = 2

## backbone
pretrained = True
use_checkpoint = False
return_interm_indices = [1, 2, 3]
freeze_norm = True
freeze_stem_only = True

## encoder
hybrid_encoder = 'hybrid_encoder_asymmetric_conv'
in_channels_encoder = [512, 1024, 2048]
pe_temperatureH = 20
pe_temperatureW = 20

## encoder
transformer_activation = 'relu'
batch_norm_type = 'FrozenBatchNorm2d'
masks = False
aux_loss = True

## decoder
num_queries = 1100
query_dim = 4
num_feature_levels = 3
dec_n_points = [4, 1, 1]
dropout = 0.0
pre_norm = False

# denoise
use_dn = True
dn_number = 300
dn_line_noise_scale = 1.0
dn_label_noise_ratio = 0.5
embed_init_tgt = True
dn_labelbook_size = 2
match_unstable_error = True

# matcher
set_cost_class = 2.0
set_cost_lines = 5.0

# criterion
criterionname = 'LINEACRITERION'
criterion_type = 'default'
weight_dict = {'loss_logits': 1, 'loss_line': 5}
losses = ['labels', 'lines']
focal_alpha = 0.1

matcher_type = 'HungarianMatcher'  # or SimpleMinsumMatcher
nms_iou_threshold = -1

# for ema
use_ema = False
ema_decay = 0.9997
ema_epoch = 0
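Note (not part of the upload): these config files are plain Python modules whose top-level variables are the configuration. The repo ships linea/util/slconfig.py for loading them; as a rough illustration of the general idea (not the actual SLConfig API), a module's top-level names can be collected into a dict like this:

import importlib.util

def load_py_config(path):
    # execute the config file as a module and keep its top-level variables
    spec = importlib.util.spec_from_file_location("cfg", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return {k: v for k, v in vars(module).items() if not k.startswith('__')}

# cfg = load_py_config('linea/configs/linea/include/linea.py')
# cfg['num_queries']  # -> 1100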
linea/configs/linea/include/optimizer.py
ADDED
@@ -0,0 +1,9 @@
lr = 0.00025
weight_decay = 0.000125
betas = [0.9, 0.999]

epochs = 12
lr_drop_list = [11]
clip_max_norm = 0.1

save_checkpoint_interval = 1
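Note (not part of the upload): these values map naturally onto AdamW plus a step schedule. A hedged sketch of how they are typically wired up; the repo's actual training loop may differ:

import torch

def build_optimizer(model, lr=0.00025, betas=(0.9, 0.999), weight_decay=0.000125,
                    lr_drop_list=(11,)):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=betas,
                                  weight_decay=weight_decay)
    # drop the learning rate by 10x at the epochs listed in lr_drop_list
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(lr_drop_list))
    return optimizer, scheduler

# during training, gradients are usually clipped with clip_max_norm:
# torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)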
linea/configs/linea/linea_hgnetv2_l.py
ADDED
@@ -0,0 +1,56 @@
_base_ = [
    './include/dataset.py',
    './include/optimizer.py',
    './include/linea.py'
]

output_dir = 'output/line_hgnetv2_l'

# backbone
backbone = 'HGNetv2_B4'
param_dict_type = backbone.lower()
use_lab = False


# transformer
feat_strides = [8, 16, 32]
hidden_dim = 256
dim_feedforward = 1024
nheads = 8
use_lmap = False

## encoder
hybrid_encoder = 'hybrid_encoder_asymmetric_conv'
in_channels_encoder = [512, 1024, 2048]
pe_temperatureH = 20
pe_temperatureW = 20
expansion = 0.5
depth_mult = 1.0

## decoder
feat_channels_decoder = [256, 256, 256]
dec_layers = 6
num_queries = 1100
num_select = 300
reg_max = 16
reg_scale = 4

# criterion
weight_dict = {'loss_logits': 4, 'loss_line': 5}
use_warmup = False

# optimizer params
model_parameters = [
    {
        'params': '^(?=.*backbone)(?!.*norm|bn).*$',
        'lr': 0.0000125
    },

    {
        'params': '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$',
        'weight_decay': 0.
    }
]
lr = 0.00025
betas = [0.9, 0.999]
weight_decay = 0.000125
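Note (not part of the upload): the 'params' entries above are regular expressions over parameter names (the look-aheads select, e.g., backbone weights that are not norm/bn layers). The repo's linea/util/get_param_dicts.py presumably resolves them into optimizer param groups; a minimal sketch of that idea, with a hypothetical helper name and first-match-wins semantics assumed:

import re

def build_param_groups(model, model_parameters, lr, weight_decay):
    named = dict(model.named_parameters())
    groups, claimed = [], set()
    for rule in model_parameters:
        pattern = re.compile(rule['params'])
        names = [n for n, p in named.items()
                 if p.requires_grad and pattern.match(n) and n not in claimed]
        claimed.update(names)
        # per-group overrides (lr, weight_decay) come straight from the config entry
        group = {k: v for k, v in rule.items() if k != 'params'}
        group['params'] = [named[n] for n in names]
        groups.append(group)
    # everything not matched by any rule falls back to the global lr / weight_decay
    rest = [p for n, p in named.items() if p.requires_grad and n not in claimed]
    groups.append({'params': rest, 'lr': lr, 'weight_decay': weight_decay})
    return groups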
linea/configs/linea/linea_hgnetv2_m.py
ADDED
@@ -0,0 +1,63 @@
_base_ = [
    './include/dataset.py',
    './include/optimizer.py',
    './include/linea.py'
]

output_dir = 'output/line_hgnetv2_m'

# backbone
backbone = 'HGNetv2_B2'
use_lab = True
freeze_norm = False
freeze_stem_only = True

# transformer
feat_strides = [8, 16, 32]
hidden_dim = 256
dim_feedforward = 512
nheads = 8
use_lmap = False

## encoder
hybrid_encoder = 'hybrid_encoder_asymmetric_conv'
in_channels_encoder = [384, 768, 1536]
pe_temperatureH = 20
pe_temperatureW = 20
expansion = 0.34
depth_mult = 1.0

## decoder
feat_channels_decoder = [hidden_dim, hidden_dim, hidden_dim]
dec_layers = 4
num_queries = 1100
num_select = 300
reg_max = 16
reg_scale = 4
eval_idx = 3

# criterion
epochs = 24
lr_drop_list = [20]
weight_dict = {'loss_logits': 2, 'loss_line': 5}
use_warmup = False

# optimizer params
model_parameters = [
    {
        'params': '^(?=.*backbone)(?!.*norm|bn).*$',
        'lr': 0.00002
    },
    {
        'params': '^(?=.*backbone)(?=.*norm|bn).*$',
        'lr': 0.00002,
        'weight_decay': 0.
    },
    {
        'params': '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$',
        'weight_decay': 0.
    }
]
lr = 0.0002
betas = [0.9, 0.999]
weight_decay = 0.0001
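Note (not part of the upload): as in the L config, `_base_` pulls in the shared include files and any value defined afterwards overrides them. Roughly (the real merge is done by SLConfig and may differ in details; the loader argument can be the load_py_config sketch shown earlier):

import os

def load_with_base(path, loader):
    # loader turns a plain .py config into a dict of its top-level names
    cfg = loader(path)
    merged = {}
    for rel in cfg.pop('_base_', []):
        # include files are resolved relative to the child config ...
        merged.update(load_with_base(os.path.join(os.path.dirname(path), rel), loader))
    # ... and keys defined in the child win over the includes
    merged.update(cfg)
    return merged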
linea/configs/linea/linea_hgnetv2_n.py
ADDED
@@ -0,0 +1,63 @@
_base_ = [
    './include/dataset.py',
    './include/optimizer.py',
    './include/linea.py'
]

output_dir = 'output/line_hgnetv2_n'

# backbone
backbone = 'HGNetv2_B0'
use_lab = True
freeze_norm = False
freeze_stem_only = True

# transformer
feat_strides = [8, 16, 32]
hidden_dim = 128
dim_feedforward = 512
nheads = 8
use_lmap = False

## encoder
hybrid_encoder = 'hybrid_encoder_asymmetric_conv'
in_channels_encoder = [256, 512, 1024]
pe_temperatureH = 20
pe_temperatureW = 20
expansion = 0.34
depth_mult = 0.5

## decoder
feat_channels_decoder = [hidden_dim, hidden_dim, hidden_dim]
dec_layers = 3
num_queries = 1100
num_select = 300
reg_max = 16
reg_scale = 4
eval_idx = 2

# criterion
epochs = 72
lr_drop_list = [60]
weight_dict = {'loss_logits': 2, 'loss_line': 5}
use_warmup = False

# optimizer params
model_parameters = [
    {
        'params': '^(?=.*backbone)(?!.*norm|bn).*$',
        'lr': 0.0004
    },
    {
        'params': '^(?=.*backbone)(?=.*norm|bn).*$',
        'lr': 0.0004,
        'weight_decay': 0.
    },
    {
        'params': '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$',
        'weight_decay': 0.
    }
]
lr = 0.0008
betas = [0.9, 0.999]
weight_decay = 0.0001
linea/configs/linea/linea_hgnetv2_s.py
ADDED
@@ -0,0 +1,64 @@
_base_ = [
    './include/dataset.py',
    './include/optimizer.py',
    './include/linea.py'
]

output_dir = 'output/line_hgnetv2_s'

# backbone
backbone = 'HGNetv2_B1'
use_lab = True
freeze_norm = False
freeze_stem_only = True

# transformer
feat_strides = [8, 16, 32]
hidden_dim = 256
dim_feedforward = 512
nheads = 8
use_lmap = False

## encoder
hybrid_encoder = 'hybrid_encoder_asymmetric_conv'
in_channels_encoder = [256, 512, 1024]
pe_temperatureH = 20
pe_temperatureW = 20
expansion = 0.34
depth_mult = 0.5

## decoder
feat_channels_decoder = [hidden_dim, hidden_dim, hidden_dim]
dec_layers = 3
num_queries = 1100
num_select = 300
reg_max = 16
reg_scale = 4
eval_idx = 2

# criterion
epochs = 36
lr_drop_list = [25]
weight_dict = {'loss_logits': 2, 'loss_line': 5}
use_warmup = True
warmup_iters = 625 * 5

# optimizer params
model_parameters = [
    {
        'params': '^(?=.*backbone)(?!.*norm|bn).*$',
        'lr': 0.0001
    },
    {
        'params': '^(?=.*backbone)(?=.*norm|bn).*$',
        'lr': 0.0001,
        'weight_decay': 0.
    },
    {
        'params': '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$',
        'weight_decay': 0.
    }
]
lr = 0.0002
betas = [0.9, 0.999]
weight_decay = 0.0001
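Note (not part of the upload): pulling the numbers out of the four variant configs above, the main knobs that change across N/S/M/L are summarized below; all values are copied from the diffs (L inherits eval_idx = 5 and epochs = 12 from the include files). Only the S config enables warm-up (use_warmup = True, warmup_iters = 625 * 5 iterations).

# (backbone, hidden_dim, dim_feedforward, dec_layers, eval_idx, epochs)
LINEA_VARIANTS = {
    'n': ('HGNetv2_B0', 128, 512, 3, 2, 72),
    's': ('HGNetv2_B1', 256, 512, 3, 2, 36),
    'm': ('HGNetv2_B2', 256, 512, 4, 3, 24),
    'l': ('HGNetv2_B4', 256, 1024, 6, 5, 12),
}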
linea/models/__init__.py
ADDED
@@ -0,0 +1,8 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .linea import build_linea
linea/models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (210 Bytes)
linea/models/__pycache__/registry.cpython-311.pyc
ADDED
Binary file (3.14 kB)
linea/models/linea/__init__.py
ADDED
@@ -0,0 +1,11 @@
# ------------------------------------------------------------------------
# Conditional DETR
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

from .linea import build_linea
from .criterion import build_criterion
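Note (not part of the upload): criterion.py below imports MODULE_BUILD_FUNCS from linea/models/registry.py, which suggests a registry of build functions keyed by config name. The registry's actual API is not shown in this upload; a generic sketch of the pattern, with illustrative names:

# a generic build-function registry; the real one lives in linea/models/registry.py
class Registry(dict):
    def register(self, name):
        def wrapper(fn):
            self[name] = fn
            return fn
        return wrapper

MODULE_BUILD_FUNCS = Registry()

@MODULE_BUILD_FUNCS.register('LINEA')
def build_linea_from_cfg(cfg):
    ...  # construct the model/criterion from the merged config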
linea/models/linea/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (281 Bytes)
linea/models/linea/__pycache__/attention_mechanism.cpython-311.pyc
ADDED
Binary file (17.4 kB)
linea/models/linea/__pycache__/criterion.cpython-311.pyc
ADDED
Binary file (40.7 kB)
linea/models/linea/__pycache__/decoder.cpython-311.pyc
ADDED
Binary file (31.3 kB)
linea/models/linea/__pycache__/dn_components.cpython-311.pyc
ADDED
Binary file (11.9 kB)
linea/models/linea/__pycache__/hgnetv2.cpython-311.pyc
ADDED
Binary file (25.1 kB)
linea/models/linea/__pycache__/hybrid_encoder_asymmetric_conv.cpython-311.pyc
ADDED
Binary file (35 kB)
linea/models/linea/__pycache__/linea.cpython-311.pyc
ADDED
Binary file (7.82 kB)
linea/models/linea/__pycache__/linea_utils.cpython-311.pyc
ADDED
Binary file (10.1 kB)
linea/models/linea/__pycache__/matcher.cpython-311.pyc
ADDED
Binary file (10.6 kB)
linea/models/linea/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (9.26 kB)
linea/models/linea/attention_mechanism.py
ADDED
@@ -0,0 +1,593 @@
import torch
import torch.nn.functional as F

from torch import nn
from torch.nn.init import xavier_uniform_, constant_

import math
import warnings  # used by the power-of-2 check below; missing from the original imports

def _is_power_of_2(n):
    if (not isinstance(n, int)) or (n < 0):
        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    return (n & (n-1) == 0) and n != 0

def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights, total_num_points):
    # for debug and test only,
    # need to use cuda version instead
    N_, S_, M_, D_ = value.shape
    _, Lq_, M_, P_, _ = sampling_locations[0].shape
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
    # sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        sampling_grid_l_ = (2 * sampling_locations[lid_] - 1).transpose(1, 2).flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
                                          mode='bilinear', padding_mode='zeros', align_corners=False)
        sampling_value_list.append(sampling_value_l_)
    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, total_num_points)
    output = (torch.cat(sampling_value_list, dim=-1) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
    return output.transpose(1, 2).contiguous()

def ms_deform_attn_core_pytorchv2(value, value_spatial_shapes, sampling_locations, attention_weights, num_points_list):
    # for debug and test only,
    # need to use cuda version instead
    _, D_, _ = value[0].shape
    N_, Lq_, M_, _, _ = sampling_locations.shape

    sampling_grids = 2 * sampling_locations - 1
    sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1)
    sampling_locations_list = sampling_grids.split(num_points_list, dim=-2)

    sampling_value_list = []
    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
        # N_*M_, D_, H_*W_ -> N_*M_, D_, H_, W_
        value_l_ = value[lid_].unflatten(2, (H_, W_))
        # N_*M_, Lq_, P_, 2
        sampling_grid_l_ = sampling_locations_list[lid_]
        # N_*M_, D_, Lq_, P_
        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
                                          mode='bilinear', padding_mode='zeros', align_corners=False)
        sampling_value_list.append(sampling_value_l_)
    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, sum(num_points_list))
    output = (torch.cat(sampling_value_list, dim=-1) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
    return output.transpose(1, 2).contiguous()


class MSDeformAttn(nn.Module):
    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, reference_points, input_flatten, input_spatial_shapes):
        """
        :param query                       (N, Length_{query}, C)
        :param reference_points           (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                          or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten              (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes       (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]

        :return output                    (N, Length_{query}, C)
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape

        value = self.value_proj(input_flatten)
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
        # NOTE: ms_deform_attn_core_pytorch now expects per-level sampling-location lists plus a
        # total point count (see MSDeformLineAttn below); this legacy call site predates that signature.
        output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        output = self.output_proj(output)
        return output

class MSDeformLineAttn(nn.Module):
    def __init__(
        self,
        d_model=256,
        n_levels=4,
        n_heads=8,
        n_points=4
    ):
        """
        This version is inspired by DFine. We removed the following layers:
            - value_proj
            - output_proj
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads

        if isinstance(n_points, list):
            assert len(n_points) == n_levels, ''
            num_points_list = n_points
        else:
            num_points_list = [n_points for _ in range(n_levels)]
        self.num_points_list = num_points_list
        self.total_num_points = sum(num_points_list)

        num_points_scale = [1/n for n in num_points_list for _ in range(n)]
        self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32).reshape(-1, 1))

        self.sampling_ratios = nn.Linear(d_model, n_heads * sum(num_points_list))
        self.attention_weights = nn.Linear(d_model, n_heads * sum(num_points_list))

        self._reset_parameters()

    def _reset_parameters(self):
        constant_(self.sampling_ratios.weight.data, 0.)
        with torch.no_grad():
            self.sampling_ratios.bias = nn.Parameter(torch.linspace(-1, 1, self.n_heads * self.total_num_points))

        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)

    def forward(self, query, reference_points, value, value_spatial_shapes):
        """
        :param query                       (N, Length_{query}, C)
        :param reference_points           (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param value                      (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param value_spatial_shapes       (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index    (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]

        :return output                    (N, Length_{query}, C)

        ####################################################################
        # Difference with respect to MSDeformAttn
        # The query already stores the line's junctions
        # :param reference_points is not needed. We keep it to make both
          MSDeformAttn and MSDeformLineAttn interchangeable
          between different frameworks
        # MSDeformLineAttn does not generate offsets. Instead, it samples
          n_points equally-spaced points from the line segment
        ####################################################################
        """
        N, Len_q, _ = query.shape

        sampling_ratios = self.sampling_ratios(query).view(N, Len_q, self.n_heads, self.total_num_points, 1)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.total_num_points)
        attention_weights = F.softmax(attention_weights, -1)

        num_points_scale = self.num_points_scale.to(dtype=query.dtype)

        vector = reference_points[:, :, None, :, :2] - reference_points[:, :, None, :, 2:]
        center = 0.5 * (reference_points[:, :, None, :, :2] + reference_points[:, :, None, :, 2:])

        sampling_locations = center + sampling_ratios * num_points_scale * vector * 0.5

        output = ms_deform_attn_core_pytorchv2(
            value,
            value_spatial_shapes,
            sampling_locations,
            attention_weights,
            self.num_points_list
        )
        return output


#######################
## Previous versions ##
#######################

# class MSDeformLineAttn(nn.Module):
#     def __init__(
#         self,
#         d_model=256,
#         n_levels=4,
#         n_heads=8,
#         n_points=4
#     ):
#         """
#         Multi-Scale Deformable Attention Module
#         :param d_model      hidden dimension
#         :param n_levels     number of feature levels
#         :param n_heads      number of attention heads
#         :param n_points     number of sampling points per attention head per feature level
#         """
#         super().__init__()
#         if d_model % n_heads != 0:
#             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
#         _d_per_head = d_model // n_heads
#         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
#         if not _is_power_of_2(_d_per_head):
#             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
#                           "which is more efficient in our CUDA implementation.")

#         self.d_model = d_model
#         self.n_levels = n_levels
#         self.n_heads = n_heads

#         if isinstance(n_points, list):
#             assert len(n_points) == n_levels, ''
#             num_points_list = n_points
#         else:
#             num_points_list = [n_points for _ in range(n_levels)]
#         self.num_points_list = num_points_list
#         self.total_num_points = sum(num_points_list)

#         num_points_scale = [1/n for n in num_points_list]
#         self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32).reshape(-1, 1, 1))

#         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * 4)

#         self.attention_weights = nn.Linear(d_model, n_heads * sum(num_points_list))
#         self.value_proj = nn.Linear(d_model, d_model)
#         self.output_proj = nn.Linear(d_model, d_model)

#         for i in range(len(num_points_list)):
#             if num_points_list[i] == 1:
#                 lambda_ = torch.linspace(0.5, 0.5, num_points_list[i])[:, None]
#             else:
#                 lambda_ = torch.linspace(0, 1, num_points_list[i])[:, None]
#             self.register_buffer(f"lambda_{i}", lambda_)

#         self._reset_parameters()

#     def _reset_parameters(self):
#         constant_(self.sampling_offsets.weight.data, 0.)
#         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
#         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
#         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, 2, 1)
#         for i in range(1):
#             grid_init[:, :, 2*i, :] *= i + 1
#             grid_init[:, :, 2*i+1, :] *= i + 1
#         with torch.no_grad():
#             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
#         constant_(self.attention_weights.weight.data, 0.)
#         constant_(self.attention_weights.bias.data, 0.)
#         xavier_uniform_(self.value_proj.weight.data)
#         constant_(self.value_proj.bias.data, 0.)
#         xavier_uniform_(self.output_proj.weight.data)
#         constant_(self.output_proj.bias.data, 0.)

#     def forward(self, query, reference_points, input_flatten, input_spatial_shapes):
#         """
#         :param query                       (N, Length_{query}, C)
#         :param reference_points           (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
#         :param input_flatten              (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
#         :param input_spatial_shapes       (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
#         :param input_level_start_index    (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]

#         :return output                    (N, Length_{query}, C)

#         ####################################################################
#         # Difference respect to MSDeformAttn
#         # The query already stores the line's junctions
#         # :param reference_points is not needed. We keep it to make both
#           MSDeformAttn and MSDeformLineAttn interchangebale
#           between different frameworks
#         # MSDeformLineAttn does not generate offsets. Instead, it samples
#           n_points equally-spaced points from the line segment
#         ####################################################################
#         """
#         N, Len_q, _ = query.shape
#         N, Len_in, _ = input_flatten.shape

#         value = self.value_proj(input_flatten)

#         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
#         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, 1, 4)
#         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.total_num_points)
#         attention_weights = F.softmax(attention_weights, -1)

#         num_points_scale = self.num_points_scale.to(dtype=query.dtype)

#         wh = reference_points[:, :, None, :, None, :2] - reference_points[:, :, None, :, None, 2:]
#         center = 0.5 * (reference_points[:, :, None, :, None, :2] + reference_points[:, :, None, :, None, 2:])

#         sampling_junctions = torch.cat((center, center), dim=-1) \
#             + sampling_offsets * num_points_scale * torch.cat([wh, wh], -1) * 0.5

#         sampling_locations = []

#         # sampling_junctions_level = torch.split(sampling_junctions, self.num_points_list, dim=-2)
#         for i in range(len(self.num_points_list)):
#             lambda_ = getattr(self, f'lambda_{i}')
#             junctions = sampling_junctions[:, :, :, i]
#             locations = junctions[..., :2] * lambda_ + junctions[..., 2:] * (1 - lambda_)
#             sampling_locations.append(locations)

#         output = ms_deform_attn_core_pytorch(
#             value,
#             input_spatial_shapes,
#             sampling_locations,
#             attention_weights,
#             self.total_num_points
#         )
#         output = self.output_proj(output)
#         return output


# class MSDeformLineAttnV2(nn.Module):
#     def __init__(
#         self,
#         d_model=256,
#         n_levels=4,
#         n_heads=8,
#         n_points=4
#     ):
#         """
#         Multi-Scale Deformable Attention Module
#         :param d_model      hidden dimension
#         :param n_levels     number of feature levels
#         :param n_heads      number of attention heads
#         :param n_points     number of sampling points per attention head per feature level
#         """
#         super().__init__()
#         if d_model % n_heads != 0:
#             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
#         _d_per_head = d_model // n_heads
#         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
#         if not _is_power_of_2(_d_per_head):
#             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
#                           "which is more efficient in our CUDA implementation.")

#         self.d_model = d_model
#         self.n_levels = n_levels
#         self.n_heads = n_heads

#         if isinstance(n_points, list):
#             assert len(n_points) == n_levels, ''
#             num_points_list = n_points
#         else:
#             num_points_list = [n_points for _ in range(n_levels)]
#         self.num_points_list = num_points_list
#         self.total_num_points = sum(num_points_list)

#         num_points_scale = [1/n for n in num_points_list]
#         self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32).reshape(-1, 1, 1))

#         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * 4)
#         self.sampling_ratios = nn.Linear(d_model, n_heads * sum(num_points_list))

#         self.attention_weights = nn.Linear(d_model, n_heads * sum(num_points_list))
#         self.value_proj = nn.Linear(d_model, d_model)
#         self.output_proj = nn.Linear(d_model, d_model)

#         self._reset_parameters()

#     def _reset_parameters(self):
#         constant_(self.sampling_offsets.weight.data, 0.)
#         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
#         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
#         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, 2, 1)
#         for i in range(1):
#             grid_init[:, :, 2*i, :] *= i + 1
#             grid_init[:, :, 2*i+1, :] *= i + 1
#         with torch.no_grad():
#             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
#         constant_(self.attention_weights.weight.data, 0.)
#         constant_(self.attention_weights.bias.data, 0.)
#         xavier_uniform_(self.value_proj.weight.data)
#         constant_(self.value_proj.bias.data, 0.)
#         xavier_uniform_(self.output_proj.weight.data)
#         constant_(self.output_proj.bias.data, 0.)

#     def forward(self, query, reference_points, input_flatten, input_spatial_shapes):
#         """
#         :param query                       (N, Length_{query}, C)
#         :param reference_points           (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
#         :param input_flatten              (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
#         :param input_spatial_shapes       (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
#         :param input_level_start_index    (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]

#         :return output                    (N, Length_{query}, C)

#         ####################################################################
#         # Difference respect to MSDeformAttn
#         # The query already stores the line's junctions
#         # :param reference_points is not needed. We keep it to make both
#           MSDeformAttn and MSDeformLineAttn interchangebale
#           between different frameworks
#         # MSDeformLineAttn does not generate offsets. Instead, it samples
#           n_points equally-spaced points from the line segment
#         ####################################################################
#         """
#         N, Len_q, _ = query.shape
#         N, Len_in, _ = input_flatten.shape

#         value = self.value_proj(input_flatten)

#         value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
#         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, 1, 4)
#         sampling_ratios = self.sampling_ratios(query).view(N, Len_q, self.n_heads, self.total_num_points).sigmoid()
#         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.total_num_points)
#         attention_weights = F.softmax(attention_weights, -1)

#         num_points_scale = self.num_points_scale.to(dtype=query.dtype)

#         wh = reference_points[:, :, None, :, None, :2] - reference_points[:, :, None, :, None, 2:]
#         center = 0.5 * (reference_points[:, :, None, :, None, :2] + reference_points[:, :, None, :, None, 2:])

#         sampling_junctions = torch.cat((center, center), dim=-1) \
#             + sampling_offsets * num_points_scale * torch.cat([wh, wh], -1) * 0.5

#         sampling_locations = []

#         for i, lambda_ in enumerate(torch.split(sampling_ratios, self.num_points_list, dim=-1)):
#             lambda_ = lambda_[..., None]
#             junctions = sampling_junctions[:, :, :, i]
#             locations = junctions[..., :2] * lambda_ + junctions[..., 2:] * (1 - lambda_)
#             sampling_locations.append(locations)

#         output = ms_deform_attn_core_pytorch(
#             value,
#             input_spatial_shapes,
#             sampling_locations,
#             attention_weights,
#             self.total_num_points
#         )
#         output = self.output_proj(output)
#         return output


# class MSDeformLineAttnV3(nn.Module):
#     def __init__(
#         self,
#         d_model=256,
#         n_levels=4,
#         n_heads=8,
#         n_points=4
#     ):
#         """
#         This version is inspired from DFine. We removed the following layers:
#             - value_proj
#             - output_proj
#         """
#         super().__init__()
#         if d_model % n_heads != 0:
#             raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
#         _d_per_head = d_model // n_heads
#         # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
#         if not _is_power_of_2(_d_per_head):
#             warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
#                           "which is more efficient in our CUDA implementation.")

#         self.d_model = d_model
#         self.n_levels = n_levels
#         self.n_heads = n_heads

#         if isinstance(n_points, list):
#             assert len(n_points) == n_levels, ''
#             num_points_list = n_points
#         else:
#             num_points_list = [n_points for _ in range(n_levels)]
#         self.num_points_list = num_points_list
#         self.total_num_points = sum(num_points_list)

#         num_points_scale = [1/n for n in num_points_list]
#         self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32).reshape(-1, 1, 1))

#         self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * 4)
#         self.sampling_ratios = nn.Linear(d_model, n_heads * sum(num_points_list))

#         self.attention_weights = nn.Linear(d_model, n_heads * sum(num_points_list))

#         self._reset_parameters()

#     def _reset_parameters(self):
#         constant_(self.sampling_offsets.weight.data, 0.)
#         thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
#         grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
#         grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, 2, 1)
#         for i in range(1):
#             grid_init[:, :, 2*i, :] *= i + 1
#             grid_init[:, :, 2*i+1, :] *= i + 1
#         with torch.no_grad():
#             self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
#         constant_(self.attention_weights.weight.data, 0.)
#         constant_(self.attention_weights.bias.data, 0.)

#     def forward(self, query, reference_points, value, value_spatial_shapes):
#         """
#         :param query                       (N, Length_{query}, C)
#         :param reference_points           (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
#         :param value                      (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
#         :param value_spatial_shapes       (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
#         :param input_level_start_index    (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]

#         :return output                    (N, Length_{query}, C)

#         ####################################################################
#         # Difference respect to MSDeformAttn
#         # The query already stores the line's junctions
#         # :param reference_points is not needed. We keep it to make both
#           MSDeformAttn and MSDeformLineAttn interchangebale
#           between different frameworks
#         # MSDeformLineAttn does not generate offsets. Instead, it samples
#           n_points equally-spaced points from the line segment
#         ####################################################################
#         """
#         N, Len_q, _ = query.shape

#         sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, 1, 4)
#         sampling_ratios = self.sampling_ratios(query).view(N, Len_q, self.n_heads, self.total_num_points).sigmoid()
#         attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.total_num_points)
#         attention_weights = F.softmax(attention_weights, -1)

#         num_points_scale = self.num_points_scale.to(dtype=query.dtype)

#         wh = reference_points[:, :, None, :, None, :2] - reference_points[:, :, None, :, None, 2:]
#         center = 0.5 * (reference_points[:, :, None, :, None, :2] + reference_points[:, :, None, :, None, 2:])

#         sampling_junctions = torch.cat((center, center), dim=-1) \
#             + sampling_offsets * num_points_scale * torch.cat([wh, wh], -1) * 0.5

#         sampling_locations = []

#         for i, lambda_ in enumerate(torch.split(sampling_ratios, self.num_points_list, dim=-1)):
#             lambda_ = lambda_[..., None]
#             junctions = sampling_junctions[:, :, :, i]
#             locations = junctions[..., :2] * lambda_ + junctions[..., 2:] * (1 - lambda_)
#             sampling_locations.append(locations)

#         output = ms_deform_attn_core_pytorchv2(
#             value,
#             value_spatial_shapes,
#             sampling_locations,
#             attention_weights,
#             self.total_num_points
#         )
#         return output
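Note (not part of the upload): a small smoke test of the active MSDeformLineAttn above, with shapes inferred from its forward pass. In particular, value appears to be expected pre-split per level as (batch * n_heads, head_dim, H*W), and the level axis of reference_points must broadcast against heads and points; treat this as an illustration, not the training-time call signature used by the decoder:

import torch
from linea.models.linea.attention_mechanism import MSDeformLineAttn

attn = MSDeformLineAttn(d_model=256, n_levels=3, n_heads=8, n_points=[4, 1, 1])

N, Lq = 2, 1100                          # batch size, number of line queries
shapes = [(80, 80), (40, 40), (20, 20)]  # per-level feature map sizes

query = torch.rand(N, Lq, 256)
# each reference is a line (x1, y1, x2, y2) in [0, 1]; the size-1 level axis broadcasts
reference_points = torch.rand(N, Lq, 1, 4)
# per-level value tensors, already reshaped to (N * n_heads, head_dim, H * W)
value = [torch.rand(N * 8, 256 // 8, h * w) for h, w in shapes]

out = attn(query, reference_points, value, shapes)
print(out.shape)  # torch.Size([2, 1100, 256])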
linea/models/linea/criterion.py
ADDED
@@ -0,0 +1,517 @@
import torch
import torch.nn.functional as F
from torch import nn
from torchvision.transforms.functional import resize

from .utils import sigmoid_focal_loss

from .matcher import build_matcher

from .linea_utils import weighting_function, bbox2distance

from ..registry import MODULE_BUILD_FUNCS

# TODO. Quick solution to make the model run on GoogleColab
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from util.misc import get_world_size, is_dist_avail_and_initialized


class LINEACriterion(nn.Module):
    """ This class computes the loss for Conditional DETR.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
    """
    def __init__(self, num_classes, matcher, weight_dict, focal_alpha, losses):
        """ Create the criterion.
        Parameters:
            num_classes: number of object categories, omitting the special no-object category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            losses: list of all the losses to be applied. See get_loss for list of available losses.
            focal_alpha: alpha in Focal Loss
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.losses = losses
        self.focal_alpha = focal_alpha

    def loss_labels(self, outputs, targets, indices, num_boxes):
        """Classification loss (Binary focal loss)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
        """
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']
        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
                                    dtype=torch.int64, device=src_logits.device)
        target_classes[idx] = target_classes_o

        target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2]+1],
                                            dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
        target_classes_onehot = target_classes_onehot[:,:,:-1]

        loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1]
        losses = {'loss_logits': loss_ce}

        return losses

    def loss_lines(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
        targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
        The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_lines' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_lines = outputs['pred_lines'][idx]
        target_lines = torch.cat([t['lines'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_line = F.l1_loss(src_lines, target_lines, reduction='none')

        losses = {}
        losses['loss_line'] = loss_line.sum() / num_boxes

        return losses

    def loss_lmap(self, outputs, targets, indices, num_boxes):
        losses = {}
        if 'aux_lmap' in outputs:
            src_lmap = outputs['aux_lmap']
            size = src_lmap[0].size(2)
            target_lmap = []
            for t in targets:
                lmaps_flatten = []
                for lmap, downsampling in zip(t['lmap'], [1, 2, 4]):
                    lmap_ = resize(lmap, (size//downsampling, size//downsampling))
                    lmaps_flatten.append(lmap_.flatten(1))
                target_lmap.append(torch.cat(lmaps_flatten, dim=1))
            target_lmap = torch.cat(target_lmap, dim=0)

            src_lmap = torch.cat([lmap_.flatten(1) for lmap_ in src_lmap], dim=1)

            loss_lmap = F.binary_cross_entropy_with_logits(src_lmap, target_lmap, reduction='mean')

            losses['loss_lmap'] = loss_lmap

        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        loss_map = {
            'labels': self.loss_labels,
            'lines': self.loss_lines,
            'lmap': self.loss_lmap,
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    def forward(self, outputs, targets, return_indices=False):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc

             return_indices: used for vis. if True, the layer0-5 indices will be returned as well.

        """
        outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}
        device = next(iter(outputs.values())).device
        indices = self.matcher(outputs_without_aux, targets)
        if return_indices:
            return indices

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}

        for loss in self.losses:
            indices_in = indices
            num_boxes_in = num_boxes
            l_dict = self.get_loss(loss, outputs, targets, indices_in, num_boxes_in)
            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
            losses.update(l_dict)

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for idx, aux_outputs in enumerate(outputs['aux_outputs']):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes)
                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
                    l_dict = {k + f'_{idx}': v for k, v in l_dict.items()}
                    losses.update(l_dict)

        # interm_outputs loss
        if 'aux_interm_outputs' in outputs:
            interm_outputs = outputs['aux_interm_outputs']
            indices = self.matcher(interm_outputs, targets)
            for loss in self.losses:
                l_dict = self.get_loss(loss, interm_outputs, targets, indices, num_boxes)
                l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
                l_dict = {k + f'_interm': v for k, v in l_dict.items()}
                losses.update(l_dict)

        # pre output loss
        if 'aux_pre_outputs' in outputs:
            pre_outputs = outputs['aux_pre_outputs']
            indices = self.matcher(pre_outputs, targets)
            for loss in self.losses:
                l_dict = self.get_loss(loss, pre_outputs, targets, indices, num_boxes)
                l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
                l_dict = {k + f'_pre': v for k, v in l_dict.items()}
                losses.update(l_dict)

        # prepare for dn loss
        dn_meta = outputs['dn_meta']

        if self.training and dn_meta and 'aux_denoise' in outputs:
            single_pad, scalar = self.prep_for_dn(dn_meta)
            dn_pos_idx = []
            dn_neg_idx = []
            for i in range(len(targets)):
                if len(targets[i]['labels']) > 0:
                    t = torch.arange(len(targets[i]['labels'])).long().cuda()
                    t = t.unsqueeze(0).repeat(scalar, 1)
                    tgt_idx = t.flatten()
                    output_idx = (torch.tensor(range(scalar)) * single_pad).long().cuda().unsqueeze(1) + t
                    output_idx = output_idx.flatten()
                else:
                    output_idx = tgt_idx = torch.tensor([]).long().cuda()

                dn_pos_idx.append((output_idx, tgt_idx))
                dn_neg_idx.append((output_idx + single_pad // 2, tgt_idx))

            dn_outputs = outputs['aux_denoise']

            # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
            if 'aux_outputs' in dn_outputs:
                for idx, aux_outputs in enumerate(dn_outputs['aux_outputs']):
                    for loss in self.losses:
                        l_dict = self.get_loss(loss, aux_outputs, targets, dn_pos_idx, num_boxes*scalar)
|
214 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
215 |
+
l_dict = {k + f'_dn_{idx}': v for k, v in l_dict.items()}
|
216 |
+
losses.update(l_dict)
|
217 |
+
|
218 |
+
if 'aux_pre_outputs' in dn_outputs:
|
219 |
+
aux_outputs_known = dn_outputs['aux_pre_outputs']
|
220 |
+
l_dict={}
|
221 |
+
for loss in self.losses:
|
222 |
+
l_dict.update(self.get_loss(loss, aux_outputs_known, targets, dn_pos_idx, num_boxes*scalar))
|
223 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
224 |
+
l_dict = {k + f'_pre_dn': v for k, v in l_dict.items()}
|
225 |
+
losses.update(l_dict)
|
226 |
+
|
227 |
+
losses = {k: v for k, v in sorted(losses.items(), key=lambda item: item[0])}
|
228 |
+
|
229 |
+
return losses
|
230 |
+
|
231 |
+
def prep_for_dn(self,dn_meta):
|
232 |
+
# output_known_lbs_lines = dn_meta['output_known_lbs_lines']
|
233 |
+
num_dn_groups, pad_size=dn_meta['num_dn_group'],dn_meta['pad_size']
|
234 |
+
assert pad_size % num_dn_groups==0
|
235 |
+
single_pad=pad_size//num_dn_groups
|
236 |
+
|
237 |
+
return single_pad,num_dn_groups
|
238 |
+
|
239 |
+
class DFINESetCriterion(LINEACriterion):
|
240 |
+
def __init__(self, num_classes, matcher, weight_dict, focal_alpha, reg_max, losses):
|
241 |
+
super().__init__(num_classes, matcher, weight_dict, focal_alpha, losses)
|
242 |
+
self.reg_max = reg_max
|
243 |
+
|
244 |
+
def loss_local(self, outputs, targets, indices, num_boxes, T=5):
|
245 |
+
losses = {}
|
246 |
+
if 'pred_corners' in outputs:
|
247 |
+
idx = self._get_src_permutation_idx(indices)
|
248 |
+
target_lines = torch.cat([t['lines'][i] for t, (_, i) in zip(targets, indices)], dim=0)
|
249 |
+
|
250 |
+
pred_corners = outputs['pred_corners'][idx].reshape(-1, (self.reg_max+1))
|
251 |
+
ref_points = outputs['ref_points'][idx].detach()
|
252 |
+
|
253 |
+
with torch.no_grad():
|
254 |
+
if self.fgl_targets_dn is None and 'is_dn' in outputs:
|
255 |
+
self.fgl_targets_dn= bbox2distance(ref_points, target_lines,
|
256 |
+
self.reg_max, outputs['reg_scale'],
|
257 |
+
outputs['up'])
|
258 |
+
if self.fgl_targets is None and 'is_dn' not in outputs:
|
259 |
+
self.fgl_targets = bbox2distance(ref_points, target_lines,
|
260 |
+
self.reg_max, outputs['reg_scale'],
|
261 |
+
outputs['up'])
|
262 |
+
|
263 |
+
target_corners, weight_right, weight_left = self.fgl_targets_dn if 'is_dn' in outputs else self.fgl_targets
|
264 |
+
|
265 |
+
losses['loss_fgl'] = self.unimodal_distribution_focal_loss(
|
266 |
+
pred_corners, target_corners, weight_right, weight_left, None, avg_factor=num_boxes)
|
267 |
+
|
268 |
+
if 'teacher_corners' in outputs:
|
269 |
+
pred_corners = outputs['pred_corners'].reshape(-1, (self.reg_max+1))
|
270 |
+
target_corners = outputs['teacher_corners'].reshape(-1, (self.reg_max+1))
|
271 |
+
if torch.equal(pred_corners, target_corners):
|
272 |
+
losses['loss_ddf'] = pred_corners.sum() * 0
|
273 |
+
else:
|
274 |
+
weight_targets_local = outputs['teacher_logits'].sigmoid().max(dim=-1)[0]
|
275 |
+
|
276 |
+
mask = torch.zeros_like(weight_targets_local, dtype=torch.bool)
|
277 |
+
mask[idx] = True
|
278 |
+
mask = mask.unsqueeze(-1).repeat(1, 1, 4).reshape(-1)
|
279 |
+
|
280 |
+
weight_targets_local = weight_targets_local.unsqueeze(-1).repeat(1, 1, 4).reshape(-1).detach()
|
281 |
+
|
282 |
+
loss_match_local = weight_targets_local * (T ** 2) * (nn.KLDivLoss(reduction='none')
|
283 |
+
(F.log_softmax(pred_corners / T, dim=1), F.softmax(target_corners.detach() / T, dim=1))).sum(-1)
|
284 |
+
if 'is_dn' not in outputs:
|
285 |
+
batch_scale = 8 / outputs['pred_lines'].shape[0] # Avoid the influence of batch size per GPU
|
286 |
+
self.num_pos, self.num_neg = (mask.sum() * batch_scale) ** 0.5, ((~mask).sum() * batch_scale) ** 0.5
|
287 |
+
loss_match_local1 = loss_match_local[mask].mean() if mask.any() else 0
|
288 |
+
loss_match_local2 = loss_match_local[~mask].mean() if (~mask).any() else 0
|
289 |
+
losses['loss_ddf'] = (loss_match_local1 * self.num_pos + loss_match_local2 * self.num_neg) / (self.num_pos + self.num_neg)
|
290 |
+
|
291 |
+
return losses
|
292 |
+
|
293 |
+
def _clear_cache(self):
|
294 |
+
self.fgl_targets, self.fgl_targets_dn = None, None
|
295 |
+
self.own_targets, self.own_targets_dn = None, None
|
296 |
+
self.num_pos, self.num_neg = None, None
|
297 |
+
|
298 |
+
def unimodal_distribution_focal_loss(self, pred, label, weight_right, weight_left, weight=None, reduction='sum', avg_factor=None):
|
299 |
+
dis_left = label.long()
|
300 |
+
dis_right = dis_left + 1
|
301 |
+
|
302 |
+
loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left.reshape(-1) \
|
303 |
+
+ F.cross_entropy(pred, dis_right, reduction='none') * weight_right.reshape(-1)
|
304 |
+
|
305 |
+
if weight is not None:
|
306 |
+
weight = weight.float()
|
307 |
+
loss = loss * weight
|
308 |
+
|
309 |
+
if avg_factor is not None:
|
310 |
+
loss = loss.sum() / avg_factor
|
311 |
+
elif reduction == 'mean':
|
312 |
+
loss = loss.mean()
|
313 |
+
elif reduction == 'sum':
|
314 |
+
loss = loss.sum()
|
315 |
+
|
316 |
+
return loss
|
317 |
+
|
318 |
+
def _get_go_indices(self, indices, indices_aux_list):
|
319 |
+
"""Get a matching union set across all decoder layers. """
|
320 |
+
results = []
|
321 |
+
for indices_aux in indices_aux_list:
|
322 |
+
indices = [(torch.cat([idx1[0], idx2[0]]), torch.cat([idx1[1], idx2[1]]))
|
323 |
+
for idx1, idx2 in zip(indices.copy(), indices_aux.copy())]
|
324 |
+
|
325 |
+
for ind in [torch.cat([idx[0][:, None], idx[1][:, None]], 1) for idx in indices]:
|
326 |
+
unique, counts = torch.unique(ind, return_counts=True, dim=0)
|
327 |
+
count_sort_indices = torch.argsort(counts, descending=True)
|
328 |
+
unique_sorted = unique[count_sort_indices]
|
329 |
+
column_to_row = {}
|
330 |
+
for idx in unique_sorted:
|
331 |
+
row_idx, col_idx = idx[0].item(), idx[1].item()
|
332 |
+
if row_idx not in column_to_row:
|
333 |
+
column_to_row[row_idx] = col_idx
|
334 |
+
final_rows = torch.tensor(list(column_to_row.keys()), device=ind.device)
|
335 |
+
final_cols = torch.tensor(list(column_to_row.values()), device=ind.device)
|
336 |
+
results.append((final_rows.long(), final_cols.long()))
|
337 |
+
return results
|
338 |
+
|
339 |
+
def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
|
340 |
+
loss_map = {
|
341 |
+
'labels': self.loss_labels,
|
342 |
+
'lines': self.loss_lines,
|
343 |
+
'lmap': self.loss_lmap,
|
344 |
+
'local': self.loss_local,
|
345 |
+
}
|
346 |
+
assert loss in loss_map, f'do you really want to compute {loss} loss?'
|
347 |
+
return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
|
348 |
+
|
349 |
+
def forward(self, outputs, targets):
|
350 |
+
""" This performs the loss computation.
|
351 |
+
Parameters:
|
352 |
+
outputs: dict of tensors, see the output specification of the model for the format
|
353 |
+
targets: list of dicts, such that len(targets) == batch_size.
|
354 |
+
The expected keys in each dict depends on the losses applied, see each loss' doc
|
355 |
+
|
356 |
+
return_indices: used for vis. if True, the layer0-5 indices will be returned as well.
|
357 |
+
|
358 |
+
"""
|
359 |
+
outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}
|
360 |
+
device = next(iter(outputs.values())).device
|
361 |
+
indices = self.matcher(outputs_without_aux, targets)
|
362 |
+
|
363 |
+
self._clear_cache()
|
364 |
+
|
365 |
+
# Get the matching union set across all decoder layers.
|
366 |
+
if 'aux_outputs' in outputs:
|
367 |
+
indices_aux_list, cached_indices, cached_indices_enc = [], [], []
|
368 |
+
for i, aux_outputs in enumerate(outputs['aux_outputs'] + [outputs['aux_pre_outputs']]):
|
369 |
+
indices_aux = self.matcher(aux_outputs, targets)
|
370 |
+
cached_indices.append(indices_aux)
|
371 |
+
indices_aux_list.append(indices_aux)
|
372 |
+
for i, aux_outputs in enumerate([outputs['aux_interm_outputs']]):
|
373 |
+
indices_enc = self.matcher(aux_outputs, targets)
|
374 |
+
cached_indices_enc.append(indices_enc)
|
375 |
+
indices_aux_list.append(indices_enc)
|
376 |
+
indices_go = self._get_go_indices(indices, indices_aux_list)
|
377 |
+
|
378 |
+
num_boxes_go = sum(len(x[0]) for x in indices_go)
|
379 |
+
num_boxes_go = torch.as_tensor([num_boxes_go], dtype=torch.float, device=next(iter(outputs.values())).device)
|
380 |
+
if is_dist_avail_and_initialized():
|
381 |
+
torch.distributed.all_reduce(num_boxes_go)
|
382 |
+
num_boxes_go = torch.clamp(num_boxes_go / get_world_size(), min=1).item()
|
383 |
+
else:
|
384 |
+
# assert 'aux_outputs' in outputs, ''
|
385 |
+
indices_go = indices
|
386 |
+
|
387 |
+
num_boxes_go = sum(len(x[0]) for x in indices_go)
|
388 |
+
num_boxes_go = torch.as_tensor([num_boxes_go], dtype=torch.float, device=next(iter(outputs.values())).device)
|
389 |
+
if is_dist_avail_and_initialized():
|
390 |
+
torch.distributed.all_reduce(num_boxes_go)
|
391 |
+
num_boxes_go = torch.clamp(num_boxes_go / get_world_size(), min=1).item()
|
392 |
+
|
393 |
+
# Compute the average number of target boxes accross all nodes, for normalization purposes
|
394 |
+
num_boxes = sum(len(t["labels"]) for t in targets)
|
395 |
+
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=device)
|
396 |
+
if is_dist_avail_and_initialized():
|
397 |
+
torch.distributed.all_reduce(num_boxes)
|
398 |
+
num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
|
399 |
+
|
400 |
+
# Compute all the requested losses
|
401 |
+
losses = {}
|
402 |
+
|
403 |
+
for loss in self.losses:
|
404 |
+
indices_in = indices_go if loss in ['lines', 'local'] else indices
|
405 |
+
num_boxes_in = num_boxes_go if loss in ['lines', 'local'] else num_boxes
|
406 |
+
l_dict = self.get_loss(loss, outputs, targets, indices_in, num_boxes_in)
|
407 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
408 |
+
losses.update(l_dict)
|
409 |
+
|
410 |
+
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
|
411 |
+
if 'aux_outputs' in outputs:
|
412 |
+
for idx, aux_outputs in enumerate(outputs['aux_outputs']):
|
413 |
+
aux_outputs['up'], aux_outputs['reg_scale'] = outputs['up'], outputs['reg_scale']
|
414 |
+
# indices = self.matcher(aux_outputs, targets)
|
415 |
+
for loss in self.losses:
|
416 |
+
indices_in = indices_go if loss in ['lines', 'local'] else cached_indices[idx]
|
417 |
+
num_boxes_in = num_boxes_go if loss in ['lines', 'local'] else num_boxes
|
418 |
+
l_dict = self.get_loss(loss, aux_outputs, targets, indices_in, num_boxes_in)
|
419 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
420 |
+
l_dict = {k + f'_{idx}': v for k, v in l_dict.items()}
|
421 |
+
losses.update(l_dict)
|
422 |
+
|
423 |
+
# interm_outputs loss
|
424 |
+
if 'aux_interm_outputs' in outputs:
|
425 |
+
interm_outputs = outputs['aux_interm_outputs']
|
426 |
+
# indices = self.matcher(interm_outputs, targets)
|
427 |
+
for loss in self.losses:
|
428 |
+
indices_in = indices_go if loss in ['lines', 'local'] else cached_indices_enc[0]
|
429 |
+
num_boxes_in = num_boxes_go if loss in ['lines', 'local'] else num_boxes
|
430 |
+
l_dict = self.get_loss(loss, interm_outputs, targets, indices_in, num_boxes_in)
|
431 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
432 |
+
l_dict = {k + f'_interm': v for k, v in l_dict.items()}
|
433 |
+
losses.update(l_dict)
|
434 |
+
|
435 |
+
# pre output loss
|
436 |
+
if 'aux_pre_outputs' in outputs:
|
437 |
+
pre_outputs = outputs['aux_pre_outputs']
|
438 |
+
# indices = self.matcher(pre_outputs, targets)
|
439 |
+
for loss in self.losses:
|
440 |
+
indices_in = indices_go if loss in ['lines', 'local'] else cached_indices[-1]
|
441 |
+
num_boxes_in = num_boxes_go if loss in ['lines', 'local'] else num_boxes
|
442 |
+
l_dict = self.get_loss(loss, pre_outputs, targets, indices_in, num_boxes_in)
|
443 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
444 |
+
l_dict = {k + f'_pre': v for k, v in l_dict.items()}
|
445 |
+
losses.update(l_dict)
|
446 |
+
|
447 |
+
|
448 |
+
# prepare for dn loss
|
449 |
+
dn_meta = outputs['dn_meta']
|
450 |
+
|
451 |
+
if self.training and dn_meta and 'aux_denoise' in outputs:
|
452 |
+
single_pad, scalar = self.prep_for_dn(dn_meta)
|
453 |
+
dn_pos_idx = []
|
454 |
+
dn_neg_idx = []
|
455 |
+
for i in range(len(targets)):
|
456 |
+
if len(targets[i]['labels']) > 0:
|
457 |
+
t = torch.arange(len(targets[i]['labels'])).long().cuda()
|
458 |
+
t = t.unsqueeze(0).repeat(scalar, 1)
|
459 |
+
tgt_idx = t.flatten()
|
460 |
+
output_idx = (torch.tensor(range(scalar)) * single_pad).long().cuda().unsqueeze(1) + t
|
461 |
+
output_idx = output_idx.flatten()
|
462 |
+
else:
|
463 |
+
output_idx = tgt_idx = torch.tensor([]).long().cuda()
|
464 |
+
|
465 |
+
dn_pos_idx.append((output_idx, tgt_idx))
|
466 |
+
dn_neg_idx.append((output_idx + single_pad // 2, tgt_idx))
|
467 |
+
|
468 |
+
dn_outputs = outputs['aux_denoise']
|
469 |
+
|
470 |
+
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
|
471 |
+
if 'aux_outputs' in dn_outputs:
|
472 |
+
for idx, aux_outputs in enumerate(dn_outputs['aux_outputs']):
|
473 |
+
aux_outputs['is_dn'] = True
|
474 |
+
aux_outputs['reg_scale'] = outputs['reg_scale']
|
475 |
+
aux_outputs['up'] = outputs['up']
|
476 |
+
# indices = self.matcher(aux_outputs, targets)
|
477 |
+
for loss in self.losses:
|
478 |
+
l_dict = self.get_loss(loss, aux_outputs, targets, dn_pos_idx, num_boxes*scalar)
|
479 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
480 |
+
l_dict = {k + f'_dn_{idx}': v for k, v in l_dict.items()}
|
481 |
+
losses.update(l_dict)
|
482 |
+
|
483 |
+
if 'aux_pre_outputs' in dn_outputs:
|
484 |
+
aux_outputs_known = dn_outputs['aux_pre_outputs']
|
485 |
+
l_dict={}
|
486 |
+
for loss in self.losses:
|
487 |
+
l_dict.update(self.get_loss(loss, aux_outputs_known, targets, dn_pos_idx, num_boxes*scalar))
|
488 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
489 |
+
l_dict = {k + f'_pre_dn': v for k, v in l_dict.items()}
|
490 |
+
losses.update(l_dict)
|
491 |
+
|
492 |
+
if 'aux_lmap' in outputs:
|
493 |
+
l_dict = self.get_loss('lmap', outputs, targets, indices, num_boxes, **kwargs)
|
494 |
+
l_dict = {k: v for k, v in l_dict.items()}
|
495 |
+
l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
|
496 |
+
losses.update(l_dict)
|
497 |
+
|
498 |
+
losses = {k: v for k, v in sorted(losses.items(), key=lambda item: item[0])}
|
499 |
+
|
500 |
+
return losses
|
501 |
+
|
502 |
+
@MODULE_BUILD_FUNCS.registe_with_name(module_name='LINEACRITERION')
|
503 |
+
def build_criterion(args):
|
504 |
+
num_classes = args.num_classes
|
505 |
+
|
506 |
+
matcher = build_matcher(args)
|
507 |
+
|
508 |
+
if args.criterion_type == 'default':
|
509 |
+
criterion = LINEACriterion(num_classes, matcher=matcher, weight_dict=args.weight_dict,
|
510 |
+
focal_alpha=args.focal_alpha, losses=args.losses)
|
511 |
+
elif args.criterion_type == 'dfine':
|
512 |
+
criterion = DFINESetCriterion(num_classes, matcher=matcher, weight_dict=args.weight_dict,
|
513 |
+
focal_alpha=args.focal_alpha, reg_max=args.reg_max, losses=args.losses)
|
514 |
+
else:
|
515 |
+
raise Exception(f"Criterion type: {args.criterion_type}.We only support two classes: 'default' and 'dfine'. ")
|
516 |
+
|
517 |
+
return criterion
|
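The unimodal distribution focal loss above spreads each continuous regression target over its two neighbouring discrete bins. A minimal sketch of that interpolation on toy tensors; the left/right bin weights are computed by hand here, whereas in the model they come from `bbox2distance`, which is defined elsewhere:

import torch
import torch.nn.functional as F

reg_max = 4
label = torch.tensor([2.3])                            # continuous bin position of one target edge
dis_left, dis_right = label.long(), label.long() + 1   # neighbouring discrete bins 2 and 3
weight_left, weight_right = dis_right - label, label - dis_left   # 0.7 and 0.3

pred = torch.randn(1, reg_max + 1)                     # logits over the reg_max + 1 bins
loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \
     + F.cross_entropy(pred, dis_right, reduction='none') * weight_right
print(loss)   # minimized when the prediction splits probability mass 0.7/0.3 over bins 2 and 3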
linea/models/linea/decoder.py
ADDED
@@ -0,0 +1,551 @@
"""
Modified from D-FINE (https://github.com/Peterande/D-FINE)
Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
---------------------------------------------------------------------------------
Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
Copyright (c) 2023 lyuwenyu. All Rights Reserved.
"""

import copy
import math
from typing import Optional
from collections import OrderedDict

import torch
from torch import nn, Tensor
import torch.nn.functional as F
import torch.nn.init as init

from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position
from .attention_mechanism import MSDeformAttn
from .attention_mechanism import MSDeformLineAttn

from .dn_components import prepare_for_cdn, dn_post_process
from .linea_utils import weighting_function, distance2bbox, inverse_sigmoid


def _get_clones(module, N, layer_share=False):
    if layer_share:
        return nn.ModuleList([module for i in range(N)])
    else:
        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class DeformableTransformerDecoderLayer(nn.Module):
    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 ):
        super().__init__()
        # cross attention
        self.cross_attn = MSDeformLineAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

    def rm_self_attn_modules(self):
        self.self_attn = None
        self.dropout2 = None
        self.norm2 = None

    @staticmethod
    def with_pos_embed(tensor, pos):
        return tensor if pos is None else tensor + pos

    def forward(self,
                # for tgt
                tgt: Optional[Tensor],  # nq, bs, d_model
                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                tgt_key_padding_mask: Optional[Tensor] = None,
                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                # for memory
                memory: Optional[Tensor] = None,  # hw, bs, d_model
                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                memory_pos: Optional[Tensor] = None,  # pos for memory

                # sa
                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                ):
        # self attention
        q = k = self.with_pos_embed(tgt, tgt_query_pos)
        tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # cross attention
        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
                               tgt_reference_points.transpose(0, 1).contiguous(),
                               memory,  # .transpose(0, 1),
                               memory_spatial_shapes,
                               ).transpose(0, 1)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # feed forward network
        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)

        return tgt


class Integral(nn.Module):
    """
    A static layer that calculates integral results from a distribution.

    This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`,
    where Pr(n) is the softmax probability vector representing the discrete
    distribution, and W(n) is the non-uniform Weighting Function.

    Args:
        reg_max (int): Max number of the discrete bins. Default is 32.
                       It can be adjusted based on the dataset or task requirements.
    """

    def __init__(self, reg_max=32):
        super(Integral, self).__init__()
        self.reg_max = reg_max

    def forward(self, x, project):
        shape = x.shape
        x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
        x = F.linear(x, project.to(x.device)).reshape(-1, 4)
        return x.reshape(list(shape[:-1]) + [-1])


class LQE(nn.Module):
    def __init__(self, k, hidden_dim, num_layers, reg_max):
        super(LQE, self).__init__()
        self.k = k
        self.reg_max = reg_max
        self.reg_conf = MLP(4 * (k + 1), hidden_dim, 1, num_layers)
        init.constant_(self.reg_conf.layers[-1].bias, 0)
        init.constant_(self.reg_conf.layers[-1].weight, 0)

    def forward(self, scores, pred_corners):
        B, L, _ = pred_corners.size()
        prob = F.softmax(pred_corners.reshape(B, L, 4, self.reg_max + 1), dim=-1)
        prob_topk, _ = prob.topk(self.k, dim=-1)
        stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1)
        quality_score = self.reg_conf(stat.reshape(B, L, -1))
        return scores + quality_score


class TransformerDecoder(nn.Module):
    def __init__(
        self,
        decoder_layer,
        num_layers,
        norm=None,
        d_model=256,
        query_dim=4,
        num_feature_levels=1,
        aux_loss=False,
        eval_idx=5,
        # from D-FINE
        reg_max=32,
        reg_scale=4,
    ):
        super().__init__()
        if num_layers > 0:
            self.layers = _get_clones(decoder_layer, num_layers)
        else:
            self.layers = []
        self.num_layers = num_layers
        # self.norm = norm
        self.query_dim = query_dim
        self.num_feature_levels = num_feature_levels

        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)

        self.reg_max = reg_max
        self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False)
        self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False)
        self.d_model = d_model

        # prediction layers
        _class_embed = nn.Linear(d_model, 2)
        _enc_bbox_embed = MLP(d_model, d_model, 4, 3)
        # init the two embed layers
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        _class_embed.bias.data = torch.ones(2) * bias_value
        nn.init.constant_(_enc_bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_enc_bbox_embed.layers[-1].bias.data, 0)

        _bbox_embed = MLP(d_model, d_model, 4 * (self.reg_max + 1), 3)

        self.bbox_embed = nn.ModuleList([copy.deepcopy(_bbox_embed) for i in range(num_layers)])
        self.class_embed = nn.ModuleList([copy.deepcopy(_class_embed) for i in range(num_layers)])
        self.lqe_layers = nn.ModuleList([copy.deepcopy(LQE(4, 64, 2, reg_max)) for _ in range(num_layers)])
        self.integral = Integral(self.reg_max)

        # two stage
        self.enc_out_bbox_embed = copy.deepcopy(_enc_bbox_embed)
        # self.enc_out_class_embed = copy.deepcopy(_class_embed)
        self.aux_loss = aux_loss

        # inference
        self.eval_idx = eval_idx

    def forward(self,
                tgt,
                memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2
                # for memory
                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                ):
        """
        Input:
            - tgt: nq, bs, d_model
            - memory: hw, bs, d_model
            - pos: hw, bs, d_model
            - refpoints_unsigmoid: nq, bs, 2/4
        """
        output = tgt
        output_detach = pred_corners_undetach = 0

        intermediate = []
        ref_points_detach = refpoints_unsigmoid.sigmoid()

        dec_out_bboxes = []
        dec_out_logits = []
        dec_out_corners = []
        dec_out_refs = []

        if not hasattr(self, 'project'):
            project = weighting_function(self.reg_max, self.up, self.reg_scale)
        else:
            project = self.project

        for layer_id, layer in enumerate(self.layers):
            ref_points_input = ref_points_detach[:, :, None]  # nq, bs, nlevel, 4

            query_sine_embed = gen_sineembed_for_position(ref_points_input[:, :, 0, :], self.d_model)  # nq, bs, 256*2

            query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256

            output = layer(
                tgt=output,
                tgt_query_pos=query_pos,
                tgt_query_sine_embed=query_sine_embed,
                tgt_key_padding_mask=tgt_key_padding_mask,
                tgt_reference_points=ref_points_input,

                memory=memory,
                memory_spatial_shapes=spatial_shapes,
                memory_pos=pos,

                self_attn_mask=tgt_mask,
                cross_attn_mask=memory_mask
            )

            if layer_id == 0:
                pre_bboxes = torch.sigmoid(self.enc_out_bbox_embed(output) + inverse_sigmoid(ref_points_detach))
                pre_scores = self.class_embed[0](output)
                ref_points_initial = pre_bboxes.detach()

            pred_corners = self.bbox_embed[layer_id](output + output_detach) + pred_corners_undetach
            inter_ref_bbox = distance2bbox(ref_points_initial, self.integral(pred_corners, project), self.reg_scale)

            if self.training or layer_id == self.eval_idx:
                scores = self.class_embed[layer_id](output)
                scores = self.lqe_layers[layer_id](scores, pred_corners)
                dec_out_logits.append(scores)
                dec_out_bboxes.append(inter_ref_bbox)
                dec_out_corners.append(pred_corners)
                dec_out_refs.append(ref_points_initial)

            pred_corners_undetach = pred_corners
            if self.training:
                ref_points_detach = inter_ref_bbox.detach()
                output_detach = output.detach()
            else:
                ref_points_detach = inter_ref_bbox
                output_detach = output

        return torch.stack(dec_out_bboxes).permute(0, 2, 1, 3), torch.stack(dec_out_logits).permute(0, 2, 1, 3), \
               pre_bboxes, pre_scores


class LINEATransformer(nn.Module):
    def __init__(
        self,
        feat_channels=[256, 256, 256],
        feat_strides=[8, 16, 32],
        d_model=256,
        num_classes=2,
        nhead=8,
        num_queries=300,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.0,
        activation="relu",
        normalize_before=False,
        query_dim=4,
        aux_loss=False,
        # for deformable encoder
        num_feature_levels=1,
        dec_n_points=4,
        # from D-FINE
        reg_max=32,
        reg_scale=4,
        # denoising
        dn_number=100,
        dn_label_noise_ratio=0.5,
        dn_line_noise_scale=0.5,
        # for inference
        eval_spatial_size=None,
        eval_idx=5
    ):
        super().__init__()

        # init learnable queries
        self.tgt_embed = nn.Embedding(num_queries, d_model)
        nn.init.normal_(self.tgt_embed.weight.data)

        # line segment detection parameters
        self.num_classes = num_classes
        self.num_queries = num_queries

        # anchor selection at the output of encoder
        self.enc_output = nn.Linear(d_model, d_model)
        self.enc_output_norm = nn.LayerNorm(d_model)
        self._reset_parameters()

        # prediction layers
        _class_embed = nn.Linear(d_model, num_classes)
        _bbox_embed = MLP(d_model, d_model, 4, 3)

        # init the two embed layers
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        _class_embed.bias.data = torch.ones(2) * bias_value
        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)
        self.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)
        self.enc_out_class_embed = copy.deepcopy(_class_embed)

        # decoder parameters
        self.d_model = d_model
        self.n_heads = nhead
        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, dec_n_points)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          d_model=d_model, query_dim=query_dim,
                                          num_feature_levels=num_feature_levels,
                                          eval_idx=eval_idx, aux_loss=aux_loss,
                                          reg_max=reg_max, reg_scale=reg_scale)

        # for inference mode
        self.eval_spatial_size = eval_spatial_size
        if eval_spatial_size is not None:
            spatial_shapes = [[int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)]
                              for s in feat_strides
                              ]
            output_proposals, output_proposals_valid = self.generate_anchors(spatial_shapes)
            self.register_buffer('output_proposals', output_proposals)
            self.register_buffer('output_proposals_mask', ~output_proposals_valid)

        # denoising parameters
        self.dn_number = dn_number
        self.dn_label_noise_ratio = dn_label_noise_ratio
        self.dn_line_noise_scale = dn_line_noise_scale
        self.label_enc = nn.Embedding(90 + 1, d_model)

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformAttn):  # or isinstance(m, MSDeformLineAttn):
                m._reset_parameters()

    def generate_anchors(self, spatial_shapes):
        proposals = []
        for lvl, (H_, W_) in enumerate(spatial_shapes):

            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32),
                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32), indexing='ij')
            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2

            scale = torch.tensor([W_, H_], dtype=torch.float32,).view(1, 1, 1, 2)
            grid = (grid.unsqueeze(0) + 0.5) / scale

            proposal = torch.cat((grid, grid), -1).view(1, -1, 4)
            proposals.append(proposal)
        output_proposals = torch.cat(proposals, 1)
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
        output_proposals = torch.log(output_proposals / (1 - output_proposals))
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

        return output_proposals, output_proposals_valid

    def forward(self, feats, targets):
        # flatten feature maps
        memory = []
        spatial_shapes = []
        split_sizes = []
        for feat in feats:
            bs, c, h, w = feat.shape
            memory.append(feat.flatten(2).permute(0, 2, 1))
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)
            split_sizes.append(h * w)

        # spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=feats[0].device)
        memory = torch.cat(memory, 1)  # bs, \sum{hxw}, c

        # two-stage
        if self.training:
            output_memory, output_proposals = gen_encoder_output_proposals(memory, spatial_shapes)
        else:
            output_proposals = self.output_proposals.repeat(bs, 1, 1)
            output_memory = memory.masked_fill(self.output_proposals_mask, float(0))

        output_memory = self.enc_output_norm(self.enc_output(output_memory))

        enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
        enc_outputs_coord_unselected = self.enc_out_bbox_embed(output_memory) + output_proposals  # (bs, \sum{hw}, 4) unsigmoid
        topk = self.num_queries
        topk_proposals = torch.topk(enc_outputs_class_unselected.max(-1)[0], topk, dim=1)[1]  # bs, nq

        # gather boxes
        refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))  # unsigmoid
        refpoint_embed = refpoint_embed_undetach.detach()
        init_box_proposal = torch.gather(output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()  # sigmoid

        # gather tgt
        tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
        tgt = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # nq, bs, d_model

        # denoise (only for training)
        if self.training and targets is not None:
            dn_tgt, dn_refpoint_embed, dn_attn_mask, dn_meta = \
                prepare_for_cdn(dn_args=(targets, self.dn_number, self.dn_label_noise_ratio, self.dn_line_noise_scale),
                                training=self.training, num_queries=self.num_queries, num_classes=self.num_classes,
                                hidden_dim=self.d_model, label_enc=self.label_enc)
            tgt = torch.cat([dn_tgt, tgt], dim=1)
            refpoint_embed = torch.cat([dn_refpoint_embed, refpoint_embed], dim=1)
        else:
            dn_attn_mask = dn_meta = None

        # preprocess memory for MSDeformableLineAttention
        value = memory.unflatten(2, (self.n_heads, -1))  # (bs, \sum{hxw}, n_heads, d_model//n_heads)
        value = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_sizes, dim=-1)
        out_coords, out_class, pre_coords, pre_class = self.decoder(
            tgt=tgt.transpose(0, 1),
            memory=value,  # memory.transpose(0, 1),
            pos=None,
            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
            spatial_shapes=spatial_shapes,
            tgt_mask=dn_attn_mask)

        # output
        if self.training:
            pre_coords = pre_coords.permute(1, 0, 2)
            pre_class = pre_class.permute(1, 0, 2)
            if dn_meta is not None:
                dn_out_coords, out_coords = torch.split(out_coords, [dn_meta['pad_size'], self.num_queries], dim=2)
                dn_out_class, out_class = torch.split(out_class, [dn_meta['pad_size'], self.num_queries], dim=2)

                dn_pre_coords, pre_coords = torch.split(pre_coords, [dn_meta['pad_size'], self.num_queries], dim=1)
                dn_pre_class, pre_class = torch.split(pre_class, [dn_meta['pad_size'], self.num_queries], dim=1)

            out = {'pred_logits': out_class[-1], 'pred_lines': out_coords[-1]}

            out['aux_pre_outputs'] = {'pred_logits': pre_class, 'pred_lines': pre_coords}

            if self.decoder.aux_loss:
                out['aux_outputs'] = self._set_aux_loss(out_class[:-1], out_coords[:-1])

            # for encoder output
            out_coords_enc = refpoint_embed_undetach.sigmoid()
            out_class_enc = self.enc_out_class_embed(tgt_undetach)
            out['aux_interm_outputs'] = {'pred_logits': out_class_enc, 'pred_lines': out_coords_enc}

            if dn_meta is not None:
                dn_out = {}
                dn_out['aux_outputs'] = self._set_aux_loss(dn_out_class, dn_out_coords)
                dn_out['aux_pre_outputs'] = {'pred_logits': dn_pre_class, 'pred_lines': dn_pre_coords}
                out['aux_denoise'] = dn_out
        else:
            out = {'pred_logits': out_class[0], 'pred_lines': out_coords[0]}

        out['dn_meta'] = dn_meta

        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_lines': b}
                for a, b in zip(outputs_class, outputs_coord)]

    @torch.jit.unused
    def _set_aux_loss2(self, outputs_class, outputs_coord, outputs_corners, outputs_ref,
                       teacher_corners=None, teacher_class=None):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_lines': b, 'pred_corners': c, 'ref_points': d,
                 'teacher_corners': teacher_corners, 'teacher_logits': teacher_class}
                for a, b, c, d in zip(outputs_class, outputs_coord, outputs_corners, outputs_ref)]


def build_decoder(args):
    return LINEATransformer(
        feat_channels=args.feat_channels_decoder,
        feat_strides=args.feat_strides,
        num_classes=args.num_classes,
        d_model=args.hidden_dim,
        nhead=args.nheads,
        num_queries=args.num_queries,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation=args.transformer_activation,
        normalize_before=args.pre_norm,
        query_dim=args.query_dim,
        aux_loss=True,
        # for deformable encoder
        num_feature_levels=args.num_feature_levels,
        dec_n_points=args.dec_n_points,
        # for D-FINE layers
        reg_max=args.reg_max,
        reg_scale=args.reg_scale,
        # for inference
        eval_spatial_size=args.eval_spatial_size,
        eval_idx=args.eval_idx,
        # for denoising
        dn_number=args.dn_number,
        dn_label_noise_ratio=args.dn_label_noise_ratio,
        dn_line_noise_scale=args.dn_line_noise_scale,
    )
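The `Integral` layer above turns each group of `reg_max + 1` logits into an expected edge distance by weighting softmax probabilities with a projection vector. A small stand-alone sketch of that step; a uniform `project` vector is assumed here, whereas the model builds it with `weighting_function`:

import torch
import torch.nn.functional as F

reg_max = 8
project = torch.linspace(0, 1, reg_max + 1)        # stand-in for weighting_function(reg_max, up, reg_scale)
x = torch.randn(2, 3, 4 * (reg_max + 1))           # (batch, queries, four distributions per line)
prob = F.softmax(x.reshape(-1, reg_max + 1), dim=1)
dist = F.linear(prob, project.unsqueeze(0)).reshape(2, 3, 4)   # expected value per edge
print(dist.shape)                                  # torch.Size([2, 3, 4])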
linea/models/linea/dn_components.py
ADDED
@@ -0,0 +1,178 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# DN-DETR
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]


import torch
from .linea_utils import inverse_sigmoid
import torch.nn.functional as F


def prepare_for_cdn(dn_args, training, num_queries, num_classes, hidden_dim, label_enc):
    """
    A major difference of DINO from DN-DETR is that the authors process the pattern embedding in the detector
    forward function and use a learnable tgt embedding, so this function is changed a little bit.
    :param dn_args: targets, dn_number, label_noise_ratio, box_noise_scale
    :param training: whether it is training or inference
    :param num_queries: number of queries
    :param num_classes: number of classes
    :param hidden_dim: transformer hidden dim
    :param label_enc: encode labels in dn
    :return:
    """
    if training:
        targets, dn_number, label_noise_ratio, box_noise_scale = dn_args
        # positive and negative dn queries
        dn_number = dn_number * 2
        known = [(torch.ones_like(t['labels'])).cuda() for t in targets]
        batch_size = len(known)
        known_num = [sum(k) for k in known]

        if int(max(known_num)) == 0:
            dn_number = 1
        else:
            if dn_number >= 100:
                dn_number = dn_number // (int(max(known_num) * 2))
            elif dn_number < 1:
                dn_number = 1
        if dn_number == 0:
            dn_number = 1

        unmask_bbox = unmask_label = torch.cat(known)
        labels = torch.cat([t['labels'] for t in targets])
        lines = torch.cat([t['lines'] for t in targets])
        batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])

        known_indice = torch.nonzero(unmask_label + unmask_bbox)
        known_indice = known_indice.view(-1)

        known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
        known_labels = labels.repeat(2 * dn_number, 1).view(-1)
        known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)
        known_lines = lines.repeat(2 * dn_number, 1)

        known_labels_expanded = known_labels.clone()
        known_lines_expand = known_lines.clone()

        if label_noise_ratio > 0:
            p = torch.rand_like(known_labels_expanded.float())
            chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1)  # half of bbox prob
            new_label = torch.randint_like(chosen_indice, 0, num_classes)  # randomly put a new one here
            known_labels_expanded.scatter_(0, chosen_indice, new_label)

        single_pad = int(max(known_num))

        pad_size = int(single_pad * 2 * dn_number)
        positive_idx = torch.tensor(range(len(lines))).long().cuda().unsqueeze(0).repeat(dn_number, 1)
        positive_idx += (torch.tensor(range(dn_number)) * len(lines) * 2).long().cuda().unsqueeze(1)
        positive_idx = positive_idx.flatten()
        negative_idx = positive_idx + len(lines)

        known_lines_ = known_lines.clone()
        known_lines_[:, :2] = (known_lines[:, :2] - known_lines[:, 2:]) / 2
        known_lines_[:, 2:] = (known_lines[:, :2] + known_lines[:, 2:]) / 2

        centers = torch.zeros_like(known_lines)
        centers[:, :2] = (known_lines_[:, :2] + known_lines_[:, 2:]) / 2
        centers[:, 2:] = (known_lines_[:, :2] + known_lines_[:, 2:]) / 2

        # Noisy length
        diff = torch.zeros_like(known_lines)
        diff[:, :2] = (known_lines[:, 2:] - known_lines[:, :2]) / 2
        diff[:, 2:] = (known_lines[:, 2:] - known_lines[:, :2]) / 2

        rand_sign = torch.randint(low=0, high=2, size=(known_lines.shape[0], 2), dtype=torch.float32, device=known_lines.device) * 2.0 - 1.0
        rand_part = torch.rand(size=(known_lines.shape[0], 2), device=known_lines.device)
        rand_part[negative_idx] += 1.2
        rand_part *= rand_sign

        known_lines_ = centers + torch.mul(rand_part.repeat_interleave(2, 1),
                                           diff).cuda() * box_noise_scale

        known_lines_expand = known_lines_.clamp(min=0.0, max=1.0)

        # order: top point > bottom point
        # if same y coordinate, right point > left point

        idx = torch.logical_or(known_lines_expand[..., 0] > known_lines_expand[..., 2],
                               torch.logical_or(
                                   known_lines_expand[..., 0] == known_lines_expand[..., 2],
                                   known_lines_expand[..., 1] < known_lines_expand[..., 3]
                               )
                               )

        known_lines_expand[idx] = known_lines_expand[idx][:, [2, 3, 0, 1]]

        m = known_labels_expanded.long().to('cuda')
        input_label_embed = label_enc(m)
        input_lines_embed = inverse_sigmoid(known_lines_expand)

        padding_label = torch.zeros(pad_size, hidden_dim).cuda()
        padding_lines = torch.zeros(pad_size, 4).cuda()

        input_query_label = padding_label.repeat(batch_size, 1, 1)
        input_query_lines = padding_lines.repeat(batch_size, 1, 1)

        map_known_indice = torch.tensor([]).to('cuda')
        if len(known_num):
            map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num])  # [1,2, 1,2,3]
            map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long()

        if len(known_bid):
            input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed
            input_query_lines[(known_bid.long(), map_known_indice)] = input_lines_embed

        tgt_size = pad_size + num_queries
        attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
        # match query cannot see the reconstruct
        attn_mask[pad_size:, :pad_size] = True
        # reconstruct cannot see each other
        for i in range(dn_number):
            if i == 0:
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
            if i == dn_number - 1:
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True
            else:
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True

        dn_meta = {
            'pad_size': pad_size,
            'num_dn_group': dn_number,
        }
    else:
        input_query_label = None
        input_query_lines = None
        attn_mask = None
        dn_meta = None

    return input_query_label, input_query_lines, attn_mask, dn_meta


def dn_post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
    """
    post process of dn after output from the transformer
    put the dn part in the dn_meta
    """
    if dn_meta and dn_meta['pad_size'] > 0:
        output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :]
        output_known_coord = outputs_coord[:, :, :dn_meta['pad_size'], :]
        outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :]
        outputs_coord = outputs_coord[:, :, dn_meta['pad_size']:, :]
        out = {'pred_logits': output_known_class[-1], 'pred_lines': output_known_coord[-1]}
        if aux_loss:
            out['aux_outputs'] = _set_aux_loss(output_known_class[1:], output_known_coord[1:])
            out['pre_outputs'] = {'pred_logits': output_known_class[0], 'pred_lines': output_known_coord[0]}
        dn_meta['output_known_lbs_lines'] = out
    return outputs_class, outputs_coord
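`prepare_for_cdn` above re-orders the endpoints of every noised line so each segment is fed to the decoder with a canonical direction. A toy check of that swap rule, copying the condition used in the function:

import torch

lines = torch.tensor([[0.2, 0.8, 0.7, 0.1],    # condition is False -> kept as-is
                      [0.9, 0.3, 0.4, 0.6]])   # x1 > x2 -> endpoints swapped
idx = torch.logical_or(lines[..., 0] > lines[..., 2],
                       torch.logical_or(lines[..., 0] == lines[..., 2],
                                        lines[..., 1] < lines[..., 3]))
lines[idx] = lines[idx][:, [2, 3, 0, 1]]
print(lines)   # second row becomes [0.4, 0.6, 0.9, 0.3]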
linea/models/linea/hgnetv2.py
ADDED
@@ -0,0 +1,595 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import os
|
5 |
+
import logging
|
6 |
+
|
7 |
+
# Constants for initialization
|
8 |
+
kaiming_normal_ = nn.init.kaiming_normal_
|
9 |
+
zeros_ = nn.init.zeros_
|
10 |
+
ones_ = nn.init.ones_
|
11 |
+
|
12 |
+
|
13 |
+
class FrozenBatchNorm2d(nn.Module):
|
14 |
+
"""copy and modified from https://github.com/facebookresearch/detr/blob/master/models/backbone.py
|
15 |
+
BatchNorm2d where the batch statistics and the affine parameters are fixed.
|
16 |
+
Copy-paste from torchvision.misc.ops with added eps before rsqrt,
|
17 |
+
without which any other models than torchvision.models.resnet[18,34,50,101]
|
18 |
+
produce nans.
|
19 |
+
"""
|
20 |
+
def __init__(self, num_features, eps=1e-5):
|
21 |
+
super(FrozenBatchNorm2d, self).__init__()
|
22 |
+
n = num_features
|
23 |
+
self.register_buffer("weight", torch.ones(n))
|
24 |
+
self.register_buffer("bias", torch.zeros(n))
|
25 |
+
self.register_buffer("running_mean", torch.zeros(n))
|
26 |
+
self.register_buffer("running_var", torch.ones(n))
|
27 |
+
self.eps = eps
|
28 |
+
self.num_features = n
|
29 |
+
|
30 |
+
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
|
31 |
+
missing_keys, unexpected_keys, error_msgs):
|
32 |
+
num_batches_tracked_key = prefix + 'num_batches_tracked'
|
33 |
+
if num_batches_tracked_key in state_dict:
|
34 |
+
del state_dict[num_batches_tracked_key]
|
35 |
+
|
36 |
+
super(FrozenBatchNorm2d, self)._load_from_state_dict(
|
37 |
+
state_dict, prefix, local_metadata, strict,
|
38 |
+
missing_keys, unexpected_keys, error_msgs)
|
39 |
+
|
40 |
+
def forward(self, x):
|
41 |
+
# move reshapes to the beginning
|
42 |
+
# to make it fuser-friendly
|
43 |
+
w = self.weight.reshape(1, -1, 1, 1)
|
44 |
+
b = self.bias.reshape(1, -1, 1, 1)
|
45 |
+
rv = self.running_var.reshape(1, -1, 1, 1)
|
46 |
+
rm = self.running_mean.reshape(1, -1, 1, 1)
|
47 |
+
scale = w * (rv + self.eps).rsqrt()
|
48 |
+
bias = b - rm * scale
|
49 |
+
return x * scale + bias
|
50 |
+
|
51 |
+
def extra_repr(self):
|
52 |
+
return (
|
53 |
+
"{num_features}, eps={eps}".format(**self.__dict__)
|
54 |
+
)
|
55 |
+
|
56 |
+
|
57 |
+
class LearnableAffineBlock(nn.Module):
|
58 |
+
def __init__(
|
59 |
+
self,
|
60 |
+
scale_value=1.0,
|
61 |
+
bias_value=0.0
|
62 |
+
):
|
63 |
+
super().__init__()
|
64 |
+
self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True)
|
65 |
+
self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True)
|
66 |
+
|
67 |
+
def forward(self, x):
|
68 |
+
return self.scale * x + self.bias
|
69 |
+
|
70 |
+
|
71 |
+
class ConvBNAct(nn.Module):
|
72 |
+
def __init__(
|
73 |
+
self,
|
74 |
+
in_chs,
|
75 |
+
out_chs,
|
76 |
+
kernel_size,
|
77 |
+
stride=1,
|
78 |
+
groups=1,
|
79 |
+
padding='',
|
80 |
+
use_act=True,
|
81 |
+
use_lab=False
|
82 |
+
):
|
83 |
+
super().__init__()
|
84 |
+
self.use_act = use_act
|
85 |
+
self.use_lab = use_lab
|
86 |
+
if padding == 'same':
|
87 |
+
self.conv = nn.Sequential(
|
88 |
+
nn.ZeroPad2d([0, 1, 0, 1]),
|
89 |
+
nn.Conv2d(
|
90 |
+
in_chs,
|
91 |
+
out_chs,
|
92 |
+
kernel_size,
|
93 |
+
stride,
|
94 |
+
groups=groups,
|
95 |
+
bias=False
|
96 |
+
)
|
97 |
+
)
|
98 |
+
else:
|
99 |
+
self.conv = nn.Conv2d(
|
100 |
+
in_chs,
|
101 |
+
out_chs,
|
102 |
+
kernel_size,
|
103 |
+
stride,
|
104 |
+
padding=(kernel_size - 1) // 2,
|
105 |
+
groups=groups,
|
106 |
+
bias=False
|
107 |
+
)
|
108 |
+
self.bn = nn.BatchNorm2d(out_chs)
|
109 |
+
if self.use_act:
|
110 |
+
self.act = nn.ReLU()
|
111 |
+
else:
|
112 |
+
self.act = nn.Identity()
|
113 |
+
if self.use_act and self.use_lab:
|
114 |
+
self.lab = LearnableAffineBlock()
|
115 |
+
else:
|
116 |
+
self.lab = nn.Identity()
|
117 |
+
|
118 |
+
def forward(self, x):
|
119 |
+
x = self.conv(x)
|
120 |
+
x = self.bn(x)
|
121 |
+
x = self.act(x)
|
122 |
+
x = self.lab(x)
|
123 |
+
return x
|
124 |
+
|
125 |
+
|
126 |
+
class LightConvBNAct(nn.Module):
|
127 |
+
def __init__(
|
128 |
+
self,
|
129 |
+
in_chs,
|
130 |
+
out_chs,
|
131 |
+
kernel_size,
|
132 |
+
groups=1,
|
133 |
+
use_lab=False,
|
134 |
+
):
|
135 |
+
super().__init__()
|
136 |
+
self.conv1 = ConvBNAct(
|
137 |
+
in_chs,
|
138 |
+
out_chs,
|
139 |
+
kernel_size=1,
|
140 |
+
use_act=False,
|
141 |
+
use_lab=use_lab,
|
142 |
+
)
|
143 |
+
self.conv2 = ConvBNAct(
|
144 |
+
out_chs,
|
145 |
+
out_chs,
|
146 |
+
kernel_size=kernel_size,
|
147 |
+
groups=out_chs,
|
148 |
+
use_act=True,
|
149 |
+
use_lab=use_lab,
|
150 |
+
)
|
151 |
+
|
152 |
+
def forward(self, x):
|
153 |
+
x = self.conv1(x)
|
154 |
+
x = self.conv2(x)
|
155 |
+
return x
|
156 |
+
|
157 |
+
|
158 |
+
class StemBlock(nn.Module):
|
159 |
+
# for HGNetv2
|
160 |
+
def __init__(self, in_chs, mid_chs, out_chs, use_lab=False):
|
161 |
+
super().__init__()
|
162 |
+
self.stem1 = ConvBNAct(
|
163 |
+
in_chs,
|
164 |
+
mid_chs,
|
165 |
+
kernel_size=3,
|
166 |
+
stride=2,
|
167 |
+
use_lab=use_lab,
|
168 |
+
)
|
169 |
+
self.stem2a = ConvBNAct(
|
170 |
+
mid_chs,
|
171 |
+
mid_chs // 2,
|
172 |
+
kernel_size=2,
|
173 |
+
stride=1,
|
174 |
+
use_lab=use_lab,
|
175 |
+
)
|
176 |
+
self.stem2b = ConvBNAct(
|
177 |
+
mid_chs // 2,
|
178 |
+
mid_chs,
|
179 |
+
kernel_size=2,
|
180 |
+
stride=1,
|
181 |
+
use_lab=use_lab,
|
182 |
+
)
|
183 |
+
self.stem3 = ConvBNAct(
|
184 |
+
mid_chs * 2,
|
185 |
+
mid_chs,
|
186 |
+
kernel_size=3,
|
187 |
+
stride=2,
|
188 |
+
use_lab=use_lab,
|
189 |
+
)
|
190 |
+
self.stem4 = ConvBNAct(
|
191 |
+
mid_chs,
|
192 |
+
out_chs,
|
193 |
+
kernel_size=1,
|
194 |
+
stride=1,
|
195 |
+
use_lab=use_lab,
|
196 |
+
)
|
197 |
+
self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True)
|
198 |
+
|
199 |
+
def forward(self, x):
|
200 |
+
x = self.stem1(x)
|
201 |
+
x = F.pad(x, (0, 1, 0, 1))
|
202 |
+
x2 = self.stem2a(x)
|
203 |
+
x2 = F.pad(x2, (0, 1, 0, 1))
|
204 |
+
x2 = self.stem2b(x2)
|
205 |
+
x1 = self.pool(x)
|
206 |
+
x = torch.cat([x1, x2], dim=1)
|
207 |
+
x = self.stem3(x)
|
208 |
+
x = self.stem4(x)
|
209 |
+
return x
|
210 |
+
|
211 |
+
|
212 |
+
class EseModule(nn.Module):
|
213 |
+
def __init__(self, chs):
|
214 |
+
super().__init__()
|
215 |
+
self.conv = nn.Conv2d(
|
216 |
+
chs,
|
217 |
+
chs,
|
218 |
+
kernel_size=1,
|
219 |
+
stride=1,
|
220 |
+
padding=0,
|
221 |
+
)
|
222 |
+
self.sigmoid = nn.Sigmoid()
|
223 |
+
|
224 |
+
def forward(self, x):
|
225 |
+
identity = x
|
226 |
+
x = x.mean((2, 3), keepdim=True)
|
227 |
+
x = self.conv(x)
|
228 |
+
x = self.sigmoid(x)
|
229 |
+
return torch.mul(identity, x)
|
230 |
+
|
231 |
+
|
232 |
+
class HG_Block(nn.Module):
|
233 |
+
def __init__(
|
234 |
+
self,
|
235 |
+
in_chs,
|
236 |
+
mid_chs,
|
237 |
+
out_chs,
|
238 |
+
layer_num,
|
239 |
+
kernel_size=3,
|
240 |
+
residual=False,
|
241 |
+
light_block=False,
|
242 |
+
use_lab=False,
|
243 |
+
agg='ese',
|
244 |
+
drop_path=0.,
|
245 |
+
):
|
246 |
+
super().__init__()
|
247 |
+
self.residual = residual
|
248 |
+
|
249 |
+
self.layers = nn.ModuleList()
|
250 |
+
for i in range(layer_num):
|
251 |
+
if light_block:
|
252 |
+
self.layers.append(
|
253 |
+
LightConvBNAct(
|
254 |
+
in_chs if i == 0 else mid_chs,
|
255 |
+
mid_chs,
|
256 |
+
kernel_size=kernel_size,
|
257 |
+
use_lab=use_lab,
|
258 |
+
)
|
259 |
+
)
|
260 |
+
else:
|
261 |
+
self.layers.append(
|
262 |
+
ConvBNAct(
|
263 |
+
in_chs if i == 0 else mid_chs,
|
264 |
+
mid_chs,
|
265 |
+
kernel_size=kernel_size,
|
266 |
+
stride=1,
|
267 |
+
use_lab=use_lab,
|
268 |
+
)
|
269 |
+
)
|
270 |
+
|
271 |
+
# feature aggregation
|
272 |
+
total_chs = in_chs + layer_num * mid_chs
|
273 |
+
if agg == 'se':
|
274 |
+
aggregation_squeeze_conv = ConvBNAct(
|
275 |
+
total_chs,
|
276 |
+
out_chs // 2,
|
277 |
+
kernel_size=1,
|
278 |
+
stride=1,
|
279 |
+
use_lab=use_lab,
|
280 |
+
)
|
281 |
+
aggregation_excitation_conv = ConvBNAct(
|
282 |
+
out_chs // 2,
|
283 |
+
out_chs,
|
284 |
+
kernel_size=1,
|
285 |
+
stride=1,
|
286 |
+
use_lab=use_lab,
|
287 |
+
)
|
288 |
+
self.aggregation = nn.Sequential(
|
289 |
+
aggregation_squeeze_conv,
|
290 |
+
aggregation_excitation_conv,
|
291 |
+
)
|
292 |
+
else:
|
293 |
+
aggregation_conv = ConvBNAct(
|
294 |
+
total_chs,
|
295 |
+
out_chs,
|
296 |
+
kernel_size=1,
|
297 |
+
stride=1,
|
298 |
+
use_lab=use_lab,
|
299 |
+
)
|
300 |
+
att = EseModule(out_chs)
|
301 |
+
self.aggregation = nn.Sequential(
|
302 |
+
aggregation_conv,
|
303 |
+
att,
|
304 |
+
)
|
305 |
+
|
306 |
+
self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity()
|
307 |
+
|
308 |
+
def forward(self, x):
|
309 |
+
identity = x
|
310 |
+
output = [x]
|
311 |
+
for layer in self.layers:
|
312 |
+
x = layer(x)
|
313 |
+
output.append(x)
|
314 |
+
x = torch.cat(output, dim=1)
|
315 |
+
x = self.aggregation(x)
|
316 |
+
if self.residual:
|
317 |
+
x = self.drop_path(x) + identity
|
318 |
+
return x
|
319 |
+
|
320 |
+
|
321 |
+
class HG_Stage(nn.Module):
|
322 |
+
def __init__(
|
323 |
+
self,
|
324 |
+
in_chs,
|
325 |
+
mid_chs,
|
326 |
+
out_chs,
|
327 |
+
block_num,
|
328 |
+
layer_num,
|
329 |
+
downsample=True,
|
330 |
+
light_block=False,
|
331 |
+
kernel_size=3,
|
332 |
+
use_lab=False,
|
333 |
+
agg='se',
|
334 |
+
drop_path=0.,
|
335 |
+
):
|
336 |
+
super().__init__()
|
337 |
+
self.downsample = downsample
|
338 |
+
if downsample:
|
339 |
+
self.downsample = ConvBNAct(
|
340 |
+
in_chs,
|
341 |
+
in_chs,
|
342 |
+
kernel_size=3,
|
343 |
+
stride=2,
|
344 |
+
groups=in_chs,
|
345 |
+
use_act=False,
|
346 |
+
use_lab=use_lab,
|
347 |
+
)
|
348 |
+
else:
|
349 |
+
self.downsample = nn.Identity()
|
350 |
+
|
351 |
+
blocks_list = []
|
352 |
+
for i in range(block_num):
|
353 |
+
blocks_list.append(
|
354 |
+
HG_Block(
|
355 |
+
in_chs if i == 0 else out_chs,
|
356 |
+
mid_chs,
|
357 |
+
out_chs,
|
358 |
+
layer_num,
|
359 |
+
residual=False if i == 0 else True,
|
360 |
+
kernel_size=kernel_size,
|
361 |
+
light_block=light_block,
|
362 |
+
use_lab=use_lab,
|
363 |
+
agg=agg,
|
364 |
+
drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path,
|
365 |
+
)
|
366 |
+
)
|
367 |
+
self.blocks = nn.Sequential(*blocks_list)
|
368 |
+
|
369 |
+
def forward(self, x):
|
370 |
+
x = self.downsample(x)
|
371 |
+
x = self.blocks(x)
|
372 |
+
return x
|
373 |
+
|
374 |
+
|
375 |
+
class HGNetv2(nn.Module):
|
376 |
+
"""
|
377 |
+
HGNetV2
|
378 |
+
Args:
|
379 |
+
stem_channels: list. Number of channels for the stem block.
|
380 |
+
stage_type: str. The stage configuration of HGNet, such as the number of channels, stride, etc.
|
381 |
+
use_lab: boolean. Whether to use LearnableAffineBlock in network.
|
382 |
+
lr_mult_list: list. Control the learning rate of different stages.
|
383 |
+
Returns:
|
384 |
+
model: nn.Module. The specific HGNetV2 model determined by the args.
|
385 |
+
"""
|
386 |
+
|
387 |
+
arch_configs = {
|
388 |
+
'B0': {
|
389 |
+
'stem_channels': [3, 16, 16],
|
390 |
+
'stage_config': {
|
391 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
392 |
+
"stage1": [16, 16, 64, 1, False, False, 3, 3],
|
393 |
+
"stage2": [64, 32, 256, 1, True, False, 3, 3],
|
394 |
+
"stage3": [256, 64, 512, 2, True, True, 5, 3],
|
395 |
+
"stage4": [512, 128, 1024, 1, True, True, 5, 3],
|
396 |
+
},
|
397 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B0_stage1.pth'
|
398 |
+
},
|
399 |
+
'B1': {
|
400 |
+
'stem_channels': [3, 24, 32],
|
401 |
+
'stage_config': {
|
402 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
403 |
+
"stage1": [32, 32, 64, 1, False, False, 3, 3],
|
404 |
+
"stage2": [64, 48, 256, 1, True, False, 3, 3],
|
405 |
+
"stage3": [256, 96, 512, 2, True, True, 5, 3],
|
406 |
+
"stage4": [512, 192, 1024, 1, True, True, 5, 3],
|
407 |
+
},
|
408 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B1_stage1.pth'
|
409 |
+
},
|
410 |
+
'B2': {
|
411 |
+
'stem_channels': [3, 24, 32],
|
412 |
+
'stage_config': {
|
413 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
414 |
+
"stage1": [32, 32, 96, 1, False, False, 3, 4],
|
415 |
+
"stage2": [96, 64, 384, 1, True, False, 3, 4],
|
416 |
+
"stage3": [384, 128, 768, 3, True, True, 5, 4],
|
417 |
+
"stage4": [768, 256, 1536, 1, True, True, 5, 4],
|
418 |
+
},
|
419 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B2_stage1.pth'
|
420 |
+
},
|
421 |
+
'B3': {
|
422 |
+
'stem_channels': [3, 24, 32],
|
423 |
+
'stage_config': {
|
424 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
425 |
+
"stage1": [32, 32, 128, 1, False, False, 3, 5],
|
426 |
+
"stage2": [128, 64, 512, 1, True, False, 3, 5],
|
427 |
+
"stage3": [512, 128, 1024, 3, True, True, 5, 5],
|
428 |
+
"stage4": [1024, 256, 2048, 1, True, True, 5, 5],
|
429 |
+
},
|
430 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B3_stage1.pth'
|
431 |
+
},
|
432 |
+
'B4': {
|
433 |
+
'stem_channels': [3, 32, 48],
|
434 |
+
'stage_config': {
|
435 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
436 |
+
"stage1": [48, 48, 128, 1, False, False, 3, 6],
|
437 |
+
"stage2": [128, 96, 512, 1, True, False, 3, 6],
|
438 |
+
"stage3": [512, 192, 1024, 3, True, True, 5, 6],
|
439 |
+
"stage4": [1024, 384, 2048, 1, True, True, 5, 6],
|
440 |
+
},
|
441 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B4_stage1.pth'
|
442 |
+
},
|
443 |
+
'B5': {
|
444 |
+
'stem_channels': [3, 32, 64],
|
445 |
+
'stage_config': {
|
446 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
447 |
+
"stage1": [64, 64, 128, 1, False, False, 3, 6],
|
448 |
+
"stage2": [128, 128, 512, 2, True, False, 3, 6],
|
449 |
+
"stage3": [512, 256, 1024, 5, True, True, 5, 6],
|
450 |
+
"stage4": [1024, 512, 2048, 2, True, True, 5, 6],
|
451 |
+
},
|
452 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B5_stage1.pth'
|
453 |
+
},
|
454 |
+
'B6': {
|
455 |
+
'stem_channels': [3, 48, 96],
|
456 |
+
'stage_config': {
|
457 |
+
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
|
458 |
+
"stage1": [96, 96, 192, 2, False, False, 3, 6],
|
459 |
+
"stage2": [192, 192, 512, 3, True, False, 3, 6],
|
460 |
+
"stage3": [512, 384, 1024, 6, True, True, 5, 6],
|
461 |
+
"stage4": [1024, 768, 2048, 3, True, True, 5, 6],
|
462 |
+
},
|
463 |
+
'url': 'https://github.com/Peterande/storage/releases/download/dfinev1.0/PPHGNetV2_B6_stage1.pth'
|
464 |
+
},
|
465 |
+
}
|
466 |
+
|
467 |
+
def __init__(self,
|
468 |
+
name,
|
469 |
+
use_lab=False,
|
470 |
+
return_idx=[1, 2, 3],
|
471 |
+
freeze_stem_only=True,
|
472 |
+
freeze_at=0,
|
473 |
+
freeze_norm=True,
|
474 |
+
pretrained=True,
|
475 |
+
local_model_dir='weight/hgnetv2/',
|
476 |
+
for_pgi=False):
|
477 |
+
super().__init__()
|
478 |
+
self.use_lab = use_lab
|
479 |
+
self.return_idx = return_idx
|
480 |
+
|
481 |
+
stem_channels = self.arch_configs[name]['stem_channels']
|
482 |
+
stage_config = self.arch_configs[name]['stage_config']
|
483 |
+
download_url = self.arch_configs[name]['url']
|
484 |
+
|
485 |
+
self._out_strides = [4, 8, 16, 32]
|
486 |
+
self._out_channels = [stage_config[k][2] for k in stage_config]
|
487 |
+
|
488 |
+
self.num_channels = self._out_channels[4 - len(return_idx):]
|
489 |
+
|
490 |
+
# stem
|
491 |
+
self.stem = StemBlock(
|
492 |
+
in_chs=stem_channels[0],
|
493 |
+
mid_chs=stem_channels[1],
|
494 |
+
out_chs=stem_channels[2],
|
495 |
+
use_lab=use_lab)
|
496 |
+
|
497 |
+
# stages
|
498 |
+
self.stages = nn.ModuleList()
|
499 |
+
for i, k in enumerate(stage_config):
|
500 |
+
in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[k]
|
501 |
+
self.stages.append(
|
502 |
+
HG_Stage(
|
503 |
+
in_channels,
|
504 |
+
mid_channels,
|
505 |
+
out_channels,
|
506 |
+
block_num,
|
507 |
+
layer_num,
|
508 |
+
downsample,
|
509 |
+
light_block,
|
510 |
+
kernel_size,
|
511 |
+
use_lab))
|
512 |
+
|
513 |
+
if freeze_at >= 0:
|
514 |
+
self._freeze_parameters(self.stem)
|
515 |
+
if not freeze_stem_only:
|
516 |
+
for i in range(min(freeze_at + 1, len(self.stages))):
|
517 |
+
self._freeze_parameters(self.stages[i])
|
518 |
+
|
519 |
+
if freeze_norm:
|
520 |
+
self._freeze_norm(self)
|
521 |
+
|
522 |
+
if pretrained:
|
523 |
+
RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m"
|
524 |
+
try:
|
525 |
+
model_path = local_model_dir + 'PPHGNetV2_' + name + '_stage1.pth'
|
526 |
+
if os.path.exists(model_path):
|
527 |
+
print("Loading stage1")
|
528 |
+
state = torch.load(model_path, map_location='cpu')
|
529 |
+
print(f"Loaded stage1 {name} HGNetV2 from local file.")
|
530 |
+
else:
|
531 |
+
# If the file doesn't exist locally, download from the URL
|
532 |
+
if torch.distributed.get_rank() == 0:
|
533 |
+
print(GREEN + "If the pretrained HGNetV2 can't be downloaded automatically. Please check your network connection." + RESET)
|
534 |
+
print(GREEN + "Please check your network connection. Or download the model manually from " + RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." + RESET)
|
535 |
+
state = torch.hub.load_state_dict_from_url(download_url, map_location='cpu', model_dir=local_model_dir)
|
536 |
+
torch.distributed.barrier()
|
537 |
+
else:
|
538 |
+
torch.distributed.barrier()
|
539 |
+
state = torch.load(model_path, map_location='cpu')  # non-zero ranks load the file rank 0 just downloaded
|
540 |
+
|
541 |
+
print(f"Loaded stage1 {name} HGNetV2 from URL.")
|
542 |
+
|
543 |
+
self.load_state_dict(state)
|
544 |
+
|
545 |
+
except (Exception, KeyboardInterrupt) as e:
|
546 |
+
if torch.distributed.get_rank() == 0:
|
547 |
+
print(f"{str(e)}")
|
548 |
+
logging.error(RED + "CRITICAL WARNING: Failed to load pretrained HGNetV2 model" + RESET)
|
549 |
+
logging.error(GREEN + "Please check your network connection. Or download the model manually from " \
|
550 |
+
+ RESET + f"{download_url}" + GREEN + " to " + RESET + f"{local_model_dir}." + RESET)
|
551 |
+
exit()
|
552 |
+
|
553 |
+
|
554 |
+
def _freeze_norm(self, m: nn.Module):
|
555 |
+
if isinstance(m, nn.BatchNorm2d):
|
556 |
+
m = FrozenBatchNorm2d(m.num_features)
|
557 |
+
else:
|
558 |
+
for name, child in m.named_children():
|
559 |
+
_child = self._freeze_norm(child)
|
560 |
+
if _child is not child:
|
561 |
+
setattr(m, name, _child)
|
562 |
+
return m
|
563 |
+
|
564 |
+
def _freeze_parameters(self, m: nn.Module):
|
565 |
+
for p in m.parameters():
|
566 |
+
p.requires_grad = False
|
567 |
+
|
568 |
+
def forward(self, x):
|
569 |
+
x = self.stem(x)
|
570 |
+
outs = []
|
571 |
+
for idx, stage in enumerate(self.stages):
|
572 |
+
x = stage(x)
|
573 |
+
if idx in self.return_idx:
|
574 |
+
outs.append(x)
|
575 |
+
return outs
|
576 |
+
|
577 |
+
def build_hgnetv2(args):
|
578 |
+
name = {
|
579 |
+
'HGNetv2_B0': 'B0',
|
580 |
+
'HGNetv2_B1': 'B1',
|
581 |
+
'HGNetv2_B2': 'B2',
|
582 |
+
'HGNetv2_B3': 'B3',
|
583 |
+
'HGNetv2_B4': 'B4',
|
584 |
+
'HGNetv2_B5': 'B5',
|
585 |
+
'HGNetv2_B6': 'B6'
|
586 |
+
}
|
587 |
+
return HGNetv2(
|
588 |
+
name[args.backbone],
|
589 |
+
return_idx=args.return_interm_indices,
|
590 |
+
freeze_at=-1,
|
591 |
+
freeze_norm=args.freeze_norm,
|
592 |
+
freeze_stem_only= args.freeze_stem_only,
|
593 |
+
use_lab = args.use_lab,
|
594 |
+
pretrained = args.pretrained,
|
595 |
+
)
|
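A quick way to sanity-check the backbone defined above is to instantiate it directly and look at the multi-scale outputs. The sketch below is illustrative only: the import path is an assumption about this repository layout, pretrained=False keeps construction from hitting the checkpoint download (and the torch.distributed rank query), and the expected shapes follow from the B4 stage_config together with the stride-4 stem.

```python
import torch
from linea.models.linea.hgnetv2 import HGNetv2   # assumed import path

backbone = HGNetv2('B4', return_idx=[1, 2, 3], freeze_at=-1, pretrained=False).eval()

x = torch.randn(1, 3, 640, 640)                   # one 640x640 RGB image
with torch.no_grad():
    feats = backbone(x)

print([tuple(f.shape) for f in feats])
# expected from the B4 table: [(1, 512, 80, 80), (1, 1024, 40, 40), (1, 2048, 20, 20)],
# i.e. the stage-2/3/4 features at strides 8, 16 and 32
```

build_hgnetv2(args) at the end of the file wraps this same constructor, mapping args.backbone names such as 'HGNetv2_B4' to these variants.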
linea/models/linea/hybrid_encoder.py
ADDED
@@ -0,0 +1,471 @@
1 |
+
"""
|
2 |
+
Modified from D-FINE (https://github.com/Peterande/D-FINE)
|
3 |
+
Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
|
4 |
+
---------------------------------------------------------------------------------
|
5 |
+
Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
|
6 |
+
Copyright (c) 2023 lyuwenyu. All Rights Reserved.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import copy
|
10 |
+
from typing import Optional
|
11 |
+
|
12 |
+
import torch
|
13 |
+
from torch import nn, Tensor
|
14 |
+
import torch.nn.functional as F
|
15 |
+
|
16 |
+
|
17 |
+
def get_activation(act: str, inpace: bool=True):
|
18 |
+
"""get activation
|
19 |
+
"""
|
20 |
+
if act is None:
|
21 |
+
return nn.Identity()
|
22 |
+
|
23 |
+
elif isinstance(act, nn.Module):
|
24 |
+
return act
|
25 |
+
|
26 |
+
act = act.lower()
|
27 |
+
|
28 |
+
if act == 'silu' or act == 'swish':
|
29 |
+
m = nn.SiLU()
|
30 |
+
|
31 |
+
elif act == 'relu':
|
32 |
+
m = nn.ReLU()
|
33 |
+
|
34 |
+
elif act == 'leaky_relu':
|
35 |
+
m = nn.LeakyReLU()
|
36 |
+
|
37 |
+
elif act == 'silu':
|
38 |
+
m = nn.SiLU()
|
39 |
+
|
40 |
+
elif act == 'gelu':
|
41 |
+
m = nn.GELU()
|
42 |
+
|
43 |
+
elif act == 'hardsigmoid':
|
44 |
+
m = nn.Hardsigmoid()
|
45 |
+
|
46 |
+
else:
|
47 |
+
raise RuntimeError(f'Unsupported activation: {act}')
|
48 |
+
|
49 |
+
if hasattr(m, 'inplace'):
|
50 |
+
m.inplace = inpace
|
51 |
+
|
52 |
+
return m
|
53 |
+
|
54 |
+
class ConvNormLayer_fuse(nn.Module):
|
55 |
+
def __init__(self, ch_in, ch_out, kernel_size, stride, g=1, padding=None, bias=False, act=None):
|
56 |
+
super().__init__()
|
57 |
+
padding = (kernel_size-1)//2 if padding is None else padding
|
58 |
+
self.conv = nn.Conv2d(
|
59 |
+
ch_in,
|
60 |
+
ch_out,
|
61 |
+
kernel_size,
|
62 |
+
stride,
|
63 |
+
groups=g,
|
64 |
+
padding=padding,
|
65 |
+
bias=bias)
|
66 |
+
self.norm = nn.BatchNorm2d(ch_out)
|
67 |
+
self.act = nn.Identity() if act is None else get_activation(act)
|
68 |
+
self.ch_in, self.ch_out, self.kernel_size, self.stride, self.g, self.padding, self.bias = \
|
69 |
+
ch_in, ch_out, kernel_size, stride, g, padding, bias
|
70 |
+
|
71 |
+
def forward(self, x):
|
72 |
+
if hasattr(self, 'conv_bn_fused'):
|
73 |
+
y = self.conv_bn_fused(x)
|
74 |
+
else:
|
75 |
+
y = self.norm(self.conv(x))
|
76 |
+
return self.act(y)
|
77 |
+
|
78 |
+
def convert_to_deploy(self):
|
79 |
+
if not hasattr(self, 'conv_bn_fused'):
|
80 |
+
self.conv_bn_fused = nn.Conv2d(
|
81 |
+
self.ch_in,
|
82 |
+
self.ch_out,
|
83 |
+
self.kernel_size,
|
84 |
+
self.stride,
|
85 |
+
groups=self.g,
|
86 |
+
padding=self.padding,
|
87 |
+
bias=True)
|
88 |
+
|
89 |
+
kernel, bias = self.get_equivalent_kernel_bias()
|
90 |
+
self.conv_bn_fused.weight.data = kernel
|
91 |
+
self.conv_bn_fused.bias.data = bias
|
92 |
+
self.__delattr__('conv')
|
93 |
+
self.__delattr__('norm')
|
94 |
+
|
95 |
+
def get_equivalent_kernel_bias(self):
|
96 |
+
kernel3x3, bias3x3 = self._fuse_bn_tensor()
|
97 |
+
|
98 |
+
return kernel3x3, bias3x3
|
99 |
+
|
100 |
+
def _fuse_bn_tensor(self):
|
101 |
+
kernel = self.conv.weight
|
102 |
+
running_mean = self.norm.running_mean
|
103 |
+
running_var = self.norm.running_var
|
104 |
+
gamma = self.norm.weight
|
105 |
+
beta = self.norm.bias
|
106 |
+
eps = self.norm.eps
|
107 |
+
std = (running_var + eps).sqrt()
|
108 |
+
t = (gamma / std).reshape(-1, 1, 1, 1)
|
109 |
+
return kernel * t, beta - running_mean * gamma / std
|
110 |
+
|
111 |
+
|
112 |
+
class ConvNormLayer(nn.Module):
|
113 |
+
def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
|
114 |
+
super().__init__()
|
115 |
+
self.conv = nn.Conv2d(
|
116 |
+
ch_in,
|
117 |
+
ch_out,
|
118 |
+
kernel_size,
|
119 |
+
stride,
|
120 |
+
padding=(kernel_size-1)//2 if padding is None else padding,
|
121 |
+
bias=bias)
|
122 |
+
self.norm = nn.BatchNorm2d(ch_out)
|
123 |
+
self.act = nn.Identity() if act is None else get_activation(act)
|
124 |
+
|
125 |
+
def forward(self, x):
|
126 |
+
return self.act(self.norm(self.conv(x)))
|
127 |
+
|
128 |
+
class SCDown(nn.Module):
|
129 |
+
def __init__(self, c1, c2, k, s):
|
130 |
+
super().__init__()
|
131 |
+
self.cv1 = ConvNormLayer_fuse(c1, c2, 1, 1)
|
132 |
+
self.cv2 = ConvNormLayer_fuse(c2, c2, k, s, c2)
|
133 |
+
|
134 |
+
def forward(self, x):
|
135 |
+
return self.cv2(self.cv1(x))
|
136 |
+
|
137 |
+
class VGGBlock(nn.Module):
|
138 |
+
def __init__(self, ch_in, ch_out, act='relu'):
|
139 |
+
super().__init__()
|
140 |
+
self.ch_in = ch_in
|
141 |
+
self.ch_out = ch_out
|
142 |
+
assert ch_out % 2 == 0
|
143 |
+
self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None)
|
144 |
+
self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None)
|
145 |
+
# self.conv1H = ConvNormLayer(ch_in, ch_out//2 , (3, 1), 1, padding=(1, 0), act=None)
|
146 |
+
# self.conv1W = ConvNormLayer(ch_in, ch_out//2, (1, 3), 1, padding=(0, 1), act=None)
|
147 |
+
# self.conv2H = ConvNormLayer(ch_in, ch_out//2, 1, 1, padding=0, act=None)
|
148 |
+
# self.conv2W = ConvNormLayer(ch_in, ch_out//2, 1, 1, padding=0, act=None)
|
149 |
+
# self.conv3 = ConvNormLayer(ch_out, ch_out, 1, 1, padding=0, act=None)
|
150 |
+
self.act = nn.Identity() if act is None else get_activation(act)
|
151 |
+
|
152 |
+
def forward(self, x):
|
153 |
+
if hasattr(self, 'conv'):
|
154 |
+
y = self.conv(x)
|
155 |
+
else:
|
156 |
+
y = self.conv1(x) + self.conv2(x)
|
157 |
+
|
158 |
+
return self.act(y)
|
159 |
+
|
160 |
+
def convert_to_deploy(self):
|
161 |
+
if not hasattr(self, 'conv'):
|
162 |
+
self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
|
163 |
+
|
164 |
+
kernel, bias = self.get_equivalent_kernel_bias()
|
165 |
+
self.conv.weight.data = kernel
|
166 |
+
self.conv.bias.data = bias
|
167 |
+
# self.__delattr__('conv1')
|
168 |
+
# self.__delattr__('conv2')
|
169 |
+
|
170 |
+
def get_equivalent_kernel_bias(self):
|
171 |
+
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
|
172 |
+
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
|
173 |
+
|
174 |
+
return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1
|
175 |
+
|
176 |
+
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
|
177 |
+
if kernel1x1 is None:
|
178 |
+
return 0
|
179 |
+
else:
|
180 |
+
return F.pad(kernel1x1, [1, 1, 1, 1])
|
181 |
+
|
182 |
+
def _fuse_bn_tensor(self, branch: ConvNormLayer):
|
183 |
+
if branch is None:
|
184 |
+
return 0, 0
|
185 |
+
kernel = branch.conv.weight
|
186 |
+
running_mean = branch.norm.running_mean
|
187 |
+
running_var = branch.norm.running_var
|
188 |
+
gamma = branch.norm.weight
|
189 |
+
beta = branch.norm.bias
|
190 |
+
eps = branch.norm.eps
|
191 |
+
std = (running_var + eps).sqrt()
|
192 |
+
t = (gamma / std).reshape(-1, 1, 1, 1)
|
193 |
+
return kernel * t, beta - running_mean * gamma / std
|
194 |
+
|
195 |
+
|
196 |
+
class RepNCSPELAN4(nn.Module):
|
197 |
+
# csp-elan
|
198 |
+
def __init__(self, c1, c2, c3, c4, n=3,
|
199 |
+
bias=False,
|
200 |
+
act="silu"):
|
201 |
+
super().__init__()
|
202 |
+
self.c = c3//2
|
203 |
+
self.cv1 = ConvNormLayer_fuse(c1, c3, 1, 1, bias=bias, act=act)
|
204 |
+
self.cv2 = nn.Sequential(CSPLayer(c3//2, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act))
|
205 |
+
self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act))
|
206 |
+
self.cv4 = ConvNormLayer_fuse(c3+(2*c4), c2, 1, 1, bias=bias, act=act)
|
207 |
+
|
208 |
+
def forward_chunk(self, x):
|
209 |
+
y = list(self.cv1(x).chunk(2, 1))
|
210 |
+
y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
|
211 |
+
return self.cv4(torch.cat(y, 1))
|
212 |
+
|
213 |
+
def forward(self, x):
|
214 |
+
y = list(self.cv1(x).split((self.c, self.c), 1))
|
215 |
+
y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
|
216 |
+
return self.cv4(torch.cat(y, 1))
|
217 |
+
|
218 |
+
|
219 |
+
class CSPLayer(nn.Module):
|
220 |
+
def __init__(self,
|
221 |
+
in_channels,
|
222 |
+
out_channels,
|
223 |
+
num_blocks=3,
|
224 |
+
expansion=1.0,
|
225 |
+
bias=None,
|
226 |
+
act="silu",
|
227 |
+
bottletype=VGGBlock):
|
228 |
+
super(CSPLayer, self).__init__()
|
229 |
+
hidden_channels = int(out_channels * expansion)
|
230 |
+
self.conv1 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act)
|
231 |
+
self.conv2 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act)
|
232 |
+
self.bottlenecks = nn.Sequential(*[
|
233 |
+
bottletype(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks)
|
234 |
+
])
|
235 |
+
if hidden_channels != out_channels:
|
236 |
+
self.conv3 = ConvNormLayer(hidden_channels, out_channels, 1, 1, bias=bias, act=act)
|
237 |
+
else:
|
238 |
+
self.conv3 = nn.Identity()
|
239 |
+
|
240 |
+
def forward(self, x):
|
241 |
+
x_1 = self.conv1(x)
|
242 |
+
x_1 = self.bottlenecks(x_1)
|
243 |
+
x_2 = self.conv2(x)
|
244 |
+
return self.conv3(x_1 + x_2)
|
245 |
+
|
246 |
+
|
247 |
+
# transformer
|
248 |
+
class TransformerEncoderLayer(nn.Module):
|
249 |
+
def __init__(self,
|
250 |
+
d_model,
|
251 |
+
nhead,
|
252 |
+
dim_feedforward=2048,
|
253 |
+
dropout=0.1,
|
254 |
+
activation="relu",
|
255 |
+
normalize_before=False):
|
256 |
+
super().__init__()
|
257 |
+
self.normalize_before = normalize_before
|
258 |
+
|
259 |
+
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True)
|
260 |
+
|
261 |
+
self.linear1 = nn.Linear(d_model, dim_feedforward)
|
262 |
+
self.dropout = nn.Dropout(dropout)
|
263 |
+
self.linear2 = nn.Linear(dim_feedforward, d_model)
|
264 |
+
|
265 |
+
self.norm1 = nn.LayerNorm(d_model)
|
266 |
+
self.norm2 = nn.LayerNorm(d_model)
|
267 |
+
self.dropout1 = nn.Dropout(dropout)
|
268 |
+
self.dropout2 = nn.Dropout(dropout)
|
269 |
+
self.activation = get_activation(activation)
|
270 |
+
|
271 |
+
@staticmethod
|
272 |
+
def with_pos_embed(tensor, pos_embed):
|
273 |
+
return tensor if pos_embed is None else tensor + pos_embed
|
274 |
+
|
275 |
+
def forward(self,
|
276 |
+
src,
|
277 |
+
src_mask=None,
|
278 |
+
src_key_padding_mask=None,
|
279 |
+
pos_embed=None) -> torch.Tensor:
|
280 |
+
residual = src
|
281 |
+
if self.normalize_before:
|
282 |
+
src = self.norm1(src)
|
283 |
+
q = k = self.with_pos_embed(src, pos_embed)
|
284 |
+
src, _ = self.self_attn(q, k,
|
285 |
+
value=src,
|
286 |
+
attn_mask=src_mask,
|
287 |
+
key_padding_mask=src_key_padding_mask)
|
288 |
+
|
289 |
+
src = residual + self.dropout1(src)
|
290 |
+
if not self.normalize_before:
|
291 |
+
src = self.norm1(src)
|
292 |
+
|
293 |
+
residual = src
|
294 |
+
if self.normalize_before:
|
295 |
+
src = self.norm2(src)
|
296 |
+
src = self.linear2(self.dropout(self.activation(self.linear1(src))))
|
297 |
+
src = residual + self.dropout2(src)
|
298 |
+
if not self.normalize_before:
|
299 |
+
src = self.norm2(src)
|
300 |
+
return src
|
301 |
+
|
302 |
+
|
303 |
+
class TransformerEncoder(nn.Module):
|
304 |
+
def __init__(self, encoder_layer, num_layers, norm=None):
|
305 |
+
super(TransformerEncoder, self).__init__()
|
306 |
+
self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)])
|
307 |
+
self.num_layers = num_layers
|
308 |
+
self.norm = norm
|
309 |
+
|
310 |
+
def forward(self,
|
311 |
+
src,
|
312 |
+
src_mask=None,
|
313 |
+
src_key_padding_mask=None,
|
314 |
+
pos_embed=None) -> torch.Tensor:
|
315 |
+
output = src
|
316 |
+
for layer in self.layers:
|
317 |
+
output = layer(output,
|
318 |
+
src_mask=src_mask,
|
319 |
+
src_key_padding_mask=src_key_padding_mask,
|
320 |
+
pos_embed=pos_embed)
|
321 |
+
|
322 |
+
if self.norm is not None:
|
323 |
+
output = self.norm(output)
|
324 |
+
|
325 |
+
return output
|
326 |
+
|
327 |
+
|
328 |
+
class HybridEncoder(nn.Module):
|
329 |
+
def __init__(self,
|
330 |
+
n_levels=3,
|
331 |
+
hidden_dim=256,
|
332 |
+
nhead=8,
|
333 |
+
dim_feedforward = 1024,
|
334 |
+
dropout=0.0,
|
335 |
+
enc_act='gelu',
|
336 |
+
use_encoder_idx=[2],
|
337 |
+
num_encoder_layers=1,
|
338 |
+
expansion=1.0,
|
339 |
+
depth_mult=1.0,
|
340 |
+
act='silu',
|
341 |
+
eval_spatial_size=None
|
342 |
+
):
|
343 |
+
super().__init__()
|
344 |
+
self.n_levels = n_levels
|
345 |
+
self.hidden_dim = hidden_dim
|
346 |
+
self.use_encoder_idx = use_encoder_idx
|
347 |
+
self.num_encoder_layers = num_encoder_layers
|
348 |
+
self.eval_spatial_size = eval_spatial_size
|
349 |
+
|
350 |
+
# encoder transformer
|
351 |
+
encoder_layer = TransformerEncoderLayer(
|
352 |
+
hidden_dim,
|
353 |
+
nhead=nhead,
|
354 |
+
dim_feedforward=dim_feedforward,
|
355 |
+
dropout=dropout,
|
356 |
+
activation=enc_act)
|
357 |
+
|
358 |
+
self.encoder = TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
|
359 |
+
|
360 |
+
# top-down fpn
|
361 |
+
self.lateral_convs = nn.ModuleList()
|
362 |
+
self.fpn_blocks = nn.ModuleList()
|
363 |
+
for _ in range(n_levels - 1, 0, -1):
|
364 |
+
self.lateral_convs.append(ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act))
|
365 |
+
self.fpn_blocks.append(
|
366 |
+
RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult))
|
367 |
+
# CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
|
368 |
+
)
|
369 |
+
|
370 |
+
# bottom-up pan
|
371 |
+
self.downsample_convs = nn.ModuleList()
|
372 |
+
self.pan_blocks = nn.ModuleList()
|
373 |
+
for _ in range(n_levels - 1):
|
374 |
+
self.downsample_convs.append(nn.Sequential(
|
375 |
+
SCDown(hidden_dim, hidden_dim, 3, 2),
|
376 |
+
)
|
377 |
+
)
|
378 |
+
self.pan_blocks.append(
|
379 |
+
RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult))
|
380 |
+
# CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
|
381 |
+
)
|
382 |
+
|
383 |
+
# self._reset_parameters()
|
384 |
+
|
385 |
+
# def _reset_parameters(self):
|
386 |
+
# if self.eval_spatial_size:
|
387 |
+
# for idx in self.use_encoder_idx:
|
388 |
+
# stride = self.feat_strides[idx]
|
389 |
+
# pos_embed = self.build_2d_sincos_position_embedding(
|
390 |
+
# self.eval_spatial_size[1] // stride, self.eval_spatial_size[0] // stride,
|
391 |
+
# self.hidden_dim, self.pe_temperature)
|
392 |
+
# setattr(self, f'pos_embed{idx}', pos_embed)
|
393 |
+
# # self.register_buffer(f'pos_embed{idx}', pos_embed)
|
394 |
+
|
395 |
+
def forward(self,
|
396 |
+
src: Tensor,
|
397 |
+
pos: Tensor,
|
398 |
+
spatial_shapes: Tensor,
|
399 |
+
level_start_index: Tensor,
|
400 |
+
valid_ratios: Tensor,
|
401 |
+
key_padding_mask: Tensor,
|
402 |
+
ref_token_index: Optional[Tensor]=None,
|
403 |
+
ref_token_coord: Optional[Tensor]=None
|
404 |
+
):
|
405 |
+
"""
|
406 |
+
Input:
|
407 |
+
- src: [bs, sum(hi*wi), 256]
|
408 |
+
- pos: pos embed for src. [bs, sum(hi*wi), 256]
|
409 |
+
- spatial_shapes: h,w of each level [num_level, 2]
|
410 |
+
- level_start_index: [num_level] start point of level in sum(hi*wi).
|
411 |
+
- valid_ratios: [bs, num_level, 2]
|
412 |
+
- key_padding_mask: [bs, sum(hi*wi)]
|
413 |
+
|
414 |
+
- ref_token_index: bs, nq
|
415 |
+
- ref_token_coord: bs, nq, 4
|
416 |
+
Intermediate:
|
417 |
+
- reference_points: [bs, sum(hi*wi), num_level, 2]
|
418 |
+
Outputs:
|
419 |
+
- output: [bs, sum(hi*wi), 256]
|
420 |
+
"""
|
421 |
+
src_list = src.split([H_ * W_ for H_, W_ in spatial_shapes], dim=1)
|
422 |
+
pos_ = pos[:, level_start_index[-1]:]
|
423 |
+
key_padding_mask_ = key_padding_mask[:, level_start_index[-1]:]
|
424 |
+
|
425 |
+
memory = self.encoder(src_list[-1], pos_embed=pos_, src_key_padding_mask=key_padding_mask_)
|
426 |
+
|
427 |
+
c = src.size(2)
|
428 |
+
proj_feats = []
|
429 |
+
for i, (H, W) in enumerate(spatial_shapes):
|
430 |
+
if i == len(spatial_shapes) - 1:
|
431 |
+
proj_feats.append(memory.reshape(-1, H, W, c).permute(0, 3, 1, 2))
|
432 |
+
continue
|
433 |
+
proj_feats.append(src_list[i].reshape(-1, H, W, c).permute(0, 3, 1, 2))
|
434 |
+
|
435 |
+
# broadcasting and fusion
|
436 |
+
inner_outs = [proj_feats[-1]]
|
437 |
+
for idx in range(self.n_levels - 1, 0, -1):
|
438 |
+
feat_high = inner_outs[0]
|
439 |
+
feat_low = proj_feats[idx - 1]
|
440 |
+
feat_high = self.lateral_convs[self.n_levels - 1 - idx](feat_high)
|
441 |
+
inner_outs[0] = feat_high
|
442 |
+
upsample_feat = F.interpolate(feat_high, scale_factor=2., mode='nearest')
|
443 |
+
inner_out = self.fpn_blocks[self.n_levels-1-idx](torch.concat([upsample_feat, feat_low], dim=1))
|
444 |
+
inner_outs.insert(0, inner_out)
|
445 |
+
|
446 |
+
outs = [inner_outs[0]]
|
447 |
+
for idx in range(self.n_levels - 1):
|
448 |
+
feat_low = outs[-1]
|
449 |
+
feat_high = inner_outs[idx + 1]
|
450 |
+
downsample_feat = self.downsample_convs[idx](feat_low)
|
451 |
+
out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_high], dim=1))
|
452 |
+
outs.append(out)
|
453 |
+
|
454 |
+
for i in range(len(outs)):
|
455 |
+
outs[i] = outs[i].flatten(2).permute(0, 2, 1)
|
456 |
+
|
457 |
+
return torch.cat(outs, dim=1), None, None
|
458 |
+
|
459 |
+
def build_hybrid_encoder(args):
|
460 |
+
return HybridEncoder(
|
461 |
+
n_levels=args.num_feature_levels,
|
462 |
+
hidden_dim=args.hidden_dim,
|
463 |
+
nhead=args.nheads,
|
464 |
+
dim_feedforward = args.dim_feedforward,
|
465 |
+
dropout=args.dropout,
|
466 |
+
enc_act='gelu',
|
467 |
+
# pe_temperature=10000,
|
468 |
+
expansion=args.expansion,
|
469 |
+
depth_mult=args.depth_mult,
|
470 |
+
act='silu',
|
471 |
+
)
|
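One detail worth calling out in the encoder file above is the re-parameterization pattern shared by ConvNormLayer_fuse and VGGBlock: during training a block runs parallel conv+BN branches, and convert_to_deploy folds them into a single convolution via get_equivalent_kernel_bias. The following is a minimal sketch of how one might check that the fusion is numerically equivalent, using the classes as defined in hybrid_encoder.py (the import path is an assumption; eval mode so BatchNorm uses its running statistics).

```python
import torch
from linea.models.linea.hybrid_encoder import VGGBlock   # assumed import path

block = VGGBlock(ch_in=64, ch_out=64, act='relu').eval()  # eval(): BN uses running stats

x = torch.randn(2, 64, 32, 32)
with torch.no_grad():
    y_branches = block(x)       # conv1 (3x3) + conv2 (1x1), then ReLU
    block.convert_to_deploy()   # folds both conv+BN branches into a single block.conv
    y_fused = block(x)

print(torch.allclose(y_branches, y_fused, atol=1e-5))      # expected: True
```

The asymmetric variant in the next file extends the same idea with additional 3x1 and 1x3 branches, which are padded to 3x3 kernels and folded in the same way.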
linea/models/linea/hybrid_encoder_asymmetric_conv.py
ADDED
@@ -0,0 +1,549 @@
1 |
+
"""
|
2 |
+
Modified from D-FINE (https://github.com/Peterande/D-FINE)
|
3 |
+
Copyright (c) 2024 The D-FINE Authors. All Rights Reserved.
|
4 |
+
---------------------------------------------------------------------------------
|
5 |
+
Modified from RT-DETR (https://github.com/lyuwenyu/RT-DETR)
|
6 |
+
Copyright (c) 2023 lyuwenyu. All Rights Reserved.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import copy
|
10 |
+
import math
|
11 |
+
from typing import Optional
|
12 |
+
|
13 |
+
import torch
|
14 |
+
from torch import nn, Tensor
|
15 |
+
import torch.nn.functional as F
|
16 |
+
|
17 |
+
|
18 |
+
def get_activation(act: str, inpace: bool=True):
|
19 |
+
"""get activation
|
20 |
+
"""
|
21 |
+
if act is None:
|
22 |
+
return nn.Identity()
|
23 |
+
|
24 |
+
elif isinstance(act, nn.Module):
|
25 |
+
return act
|
26 |
+
|
27 |
+
act = act.lower()
|
28 |
+
|
29 |
+
if act == 'silu' or act == 'swish':
|
30 |
+
m = nn.SiLU()
|
31 |
+
|
32 |
+
elif act == 'relu':
|
33 |
+
m = nn.ReLU()
|
34 |
+
|
35 |
+
elif act == 'leaky_relu':
|
36 |
+
m = nn.LeakyReLU()
|
37 |
+
|
38 |
+
elif act == 'silu':
|
39 |
+
m = nn.SiLU()
|
40 |
+
|
41 |
+
elif act == 'gelu':
|
42 |
+
m = nn.GELU()
|
43 |
+
|
44 |
+
elif act == 'hardsigmoid':
|
45 |
+
m = nn.Hardsigmoid()
|
46 |
+
|
47 |
+
else:
|
48 |
+
raise RuntimeError(f'Unsupported activation: {act}')
|
49 |
+
|
50 |
+
if hasattr(m, 'inplace'):
|
51 |
+
m.inplace = inpace
|
52 |
+
|
53 |
+
return m
|
54 |
+
|
55 |
+
class CBLinear(nn.Module):
|
56 |
+
def __init__(self, c1, c2s, k=1, s=1, p=None, g=1): # ch_in, ch_outs, kernel, stride, padding, groups
|
57 |
+
super(CBLinear, self).__init__()
|
58 |
+
self.c2s = c2s
|
59 |
+
self.conv = nn.Conv2d(c1, sum(c2s), k, s, 0, groups=g, bias=True)
|
60 |
+
|
61 |
+
def forward(self, x):
|
62 |
+
outs = self.conv(x).split(self.c2s, dim=1)
|
63 |
+
return outs
|
64 |
+
|
65 |
+
class CBFuse(nn.Module):
|
66 |
+
def __init__(self, idx):
|
67 |
+
super(CBFuse, self).__init__()
|
68 |
+
self.idx = idx
|
69 |
+
|
70 |
+
def forward(self, xs):
|
71 |
+
target_size = xs[-1].shape[2:]
|
72 |
+
res = [F.interpolate(x[self.idx[i]], size=target_size, mode='nearest') for i, x in enumerate(xs[:-1])]
|
73 |
+
out = torch.sum(torch.stack(res + xs[-1:]), dim=0)
|
74 |
+
return out
|
75 |
+
|
76 |
+
class ConvNormLayer_fuse(nn.Module):
|
77 |
+
def __init__(self, ch_in, ch_out, kernel_size, stride, g=1, padding=None, bias=False, act=None):
|
78 |
+
super().__init__()
|
79 |
+
padding = (kernel_size-1)//2 if padding is None else padding
|
80 |
+
self.conv = nn.Conv2d(
|
81 |
+
ch_in,
|
82 |
+
ch_out,
|
83 |
+
kernel_size,
|
84 |
+
stride,
|
85 |
+
groups=g,
|
86 |
+
padding=padding,
|
87 |
+
bias=bias)
|
88 |
+
self.norm = nn.BatchNorm2d(ch_out)
|
89 |
+
self.act = nn.Identity() if act is None else get_activation(act)
|
90 |
+
self.ch_in, self.ch_out, self.kernel_size, self.stride, self.g, self.padding, self.bias = \
|
91 |
+
ch_in, ch_out, kernel_size, stride, g, padding, bias
|
92 |
+
|
93 |
+
def forward(self, x):
|
94 |
+
if hasattr(self, 'conv_bn_fused'):
|
95 |
+
y = self.conv_bn_fused(x)
|
96 |
+
else:
|
97 |
+
y = self.norm(self.conv(x))
|
98 |
+
return self.act(y)
|
99 |
+
|
100 |
+
def convert_to_deploy(self):
|
101 |
+
if not hasattr(self, 'conv_bn_fused'):
|
102 |
+
self.conv_bn_fused = nn.Conv2d(
|
103 |
+
self.ch_in,
|
104 |
+
self.ch_out,
|
105 |
+
self.kernel_size,
|
106 |
+
self.stride,
|
107 |
+
groups=self.g,
|
108 |
+
padding=self.padding,
|
109 |
+
bias=True)
|
110 |
+
|
111 |
+
kernel, bias = self.get_equivalent_kernel_bias()
|
112 |
+
self.conv_bn_fused.weight.data = kernel
|
113 |
+
self.conv_bn_fused.bias.data = bias
|
114 |
+
self.__delattr__('conv')
|
115 |
+
self.__delattr__('norm')
|
116 |
+
|
117 |
+
def get_equivalent_kernel_bias(self):
|
118 |
+
kernel3x3, bias3x3 = self._fuse_bn_tensor()
|
119 |
+
|
120 |
+
return kernel3x3, bias3x3
|
121 |
+
|
122 |
+
def _fuse_bn_tensor(self):
|
123 |
+
kernel = self.conv.weight
|
124 |
+
running_mean = self.norm.running_mean
|
125 |
+
running_var = self.norm.running_var
|
126 |
+
gamma = self.norm.weight
|
127 |
+
beta = self.norm.bias
|
128 |
+
eps = self.norm.eps
|
129 |
+
std = (running_var + eps).sqrt()
|
130 |
+
t = (gamma / std).reshape(-1, 1, 1, 1)
|
131 |
+
return kernel * t, beta - running_mean * gamma / std
|
132 |
+
|
133 |
+
|
134 |
+
class ConvNormLayer(nn.Module):
|
135 |
+
def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
|
136 |
+
super().__init__()
|
137 |
+
self.conv = nn.Conv2d(
|
138 |
+
ch_in,
|
139 |
+
ch_out,
|
140 |
+
kernel_size,
|
141 |
+
stride,
|
142 |
+
padding=(kernel_size-1)//2 if padding is None else padding,
|
143 |
+
bias=bias)
|
144 |
+
self.norm = nn.BatchNorm2d(ch_out)
|
145 |
+
self.act = nn.Identity() if act is None else get_activation(act)
|
146 |
+
|
147 |
+
def forward(self, x):
|
148 |
+
return self.act(self.norm(self.conv(x)))
|
149 |
+
|
150 |
+
class SCDown(nn.Module):
|
151 |
+
def __init__(self, c1, c2, k, s):
|
152 |
+
super().__init__()
|
153 |
+
self.cv1 = ConvNormLayer_fuse(c1, c2, 1, 1)
|
154 |
+
self.cv2 = ConvNormLayer_fuse(c2, c2, k, s, c2)
|
155 |
+
|
156 |
+
def forward(self, x):
|
157 |
+
return self.cv2(self.cv1(x))
|
158 |
+
|
159 |
+
class VGGBlock(nn.Module):
|
160 |
+
def __init__(self, ch_in, ch_out, act='relu'):
|
161 |
+
super().__init__()
|
162 |
+
self.ch_in = ch_in
|
163 |
+
self.ch_out = ch_out
|
164 |
+
self.convH = ConvNormLayer(ch_in, ch_out, (3, 1), 1, padding=(1, 0), act=None)
|
165 |
+
self.convW = ConvNormLayer(ch_in, ch_out, (1, 3), 1, padding=(0, 1), act=None)
|
166 |
+
self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None)
|
167 |
+
self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None)
|
168 |
+
self.act = nn.Identity() if act is None else get_activation(act)
|
169 |
+
|
170 |
+
def forward(self, x):
|
171 |
+
if hasattr(self, 'conv'):
|
172 |
+
y = self.conv(x)
|
173 |
+
else:
|
174 |
+
y_vertical = self.convH(x)
|
175 |
+
y_horizontal = self.convW(x)
|
176 |
+
y = self.conv1(x) + self.conv2(x) + y_horizontal + y_vertical
|
177 |
+
|
178 |
+
return self.act(y)
|
179 |
+
|
180 |
+
def convert_to_deploy(self):
|
181 |
+
if not hasattr(self, 'conv'):
|
182 |
+
self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
|
183 |
+
|
184 |
+
kernel, bias = self.get_equivalent_kernel_bias()
|
185 |
+
self.conv.weight.data = kernel
|
186 |
+
self.conv.bias.data = bias
|
187 |
+
self.__delattr__('conv1')
|
188 |
+
self.__delattr__('conv2')
|
189 |
+
self.__delattr__('convH')
|
190 |
+
self.__delattr__('convW')
|
191 |
+
|
192 |
+
def get_equivalent_kernel_bias(self):
|
193 |
+
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
|
194 |
+
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
|
195 |
+
kernel3x1, bias3x1 = self._fuse_bn_tensor(self.convH)
|
196 |
+
kernel1x3, bias1x3 = self._fuse_bn_tensor(self.convW)
|
197 |
+
|
198 |
+
kernel = kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + self._pad_1x1_to_3x3_tensor(kernel3x1, 'Vertical') + self._pad_1x1_to_3x3_tensor(kernel1x3, 'Horizontal')
|
199 |
+
bias = bias3x3 + bias1x1 + bias3x1 + bias1x3
|
200 |
+
return kernel, bias
|
201 |
+
|
202 |
+
def _pad_1x1_to_3x3_tensor(self, kernel1x1, assymetric='None'):
|
203 |
+
if kernel1x1 is None:
|
204 |
+
return 0
|
205 |
+
else:
|
206 |
+
if assymetric == 'None':
|
207 |
+
return F.pad(kernel1x1, [1, 1, 1, 1])
|
208 |
+
elif assymetric == 'Vertical':
|
209 |
+
return F.pad(kernel1x1, [1, 1, 0, 0])
|
210 |
+
elif assymetric == 'Horizontal':
|
211 |
+
return F.pad(kernel1x1, [0, 0, 1, 1])
|
212 |
+
|
213 |
+
def _fuse_bn_tensor(self, branch: ConvNormLayer):
|
214 |
+
if branch is None:
|
215 |
+
return 0, 0
|
216 |
+
kernel = branch.conv.weight
|
217 |
+
running_mean = branch.norm.running_mean
|
218 |
+
running_var = branch.norm.running_var
|
219 |
+
gamma = branch.norm.weight
|
220 |
+
beta = branch.norm.bias
|
221 |
+
eps = branch.norm.eps
|
222 |
+
std = (running_var + eps).sqrt()
|
223 |
+
t = (gamma / std).reshape(-1, 1, 1, 1)
|
224 |
+
return kernel * t, beta - running_mean * gamma / std
|
225 |
+
|
226 |
+
|
227 |
+
class RepNCSPELAN4(nn.Module):
|
228 |
+
# csp-elan
|
229 |
+
def __init__(self, c1, c2, c3, c4, n=3,
|
230 |
+
bias=False,
|
231 |
+
act="silu"):
|
232 |
+
super().__init__()
|
233 |
+
self.c = c3//2
|
234 |
+
self.cv1 = ConvNormLayer_fuse(c1, c3, 1, 1, bias=bias, act=act)
|
235 |
+
self.cv2 = nn.Sequential(CSPLayer(c3//2, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act))
|
236 |
+
self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1, bias=bias, act=act, bottletype=VGGBlock), ConvNormLayer_fuse(c4, c4, 3, 1, bias=bias, act=act))
|
237 |
+
self.cv4 = ConvNormLayer_fuse(c3+(2*c4), c2, 1, 1, bias=bias, act=act)
|
238 |
+
|
239 |
+
def forward_chunk(self, x):
|
240 |
+
y = list(self.cv1(x).chunk(2, 1))
|
241 |
+
y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
|
242 |
+
return self.cv4(torch.cat(y, 1))
|
243 |
+
|
244 |
+
def forward(self, x):
|
245 |
+
y = list(self.cv1(x).split((self.c, self.c), 1))
|
246 |
+
y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
|
247 |
+
return self.cv4(torch.cat(y, 1))
|
248 |
+
|
249 |
+
|
250 |
+
class CSPLayer(nn.Module):
|
251 |
+
def __init__(self,
|
252 |
+
in_channels,
|
253 |
+
out_channels,
|
254 |
+
num_blocks=3,
|
255 |
+
expansion=1.0,
|
256 |
+
bias=None,
|
257 |
+
act="silu",
|
258 |
+
bottletype=VGGBlock):
|
259 |
+
super(CSPLayer, self).__init__()
|
260 |
+
hidden_channels = int(out_channels * expansion)
|
261 |
+
self.conv1 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act)
|
262 |
+
self.conv2 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act)
|
263 |
+
self.bottlenecks = nn.Sequential(*[
|
264 |
+
bottletype(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks)
|
265 |
+
])
|
266 |
+
if hidden_channels != out_channels:
|
267 |
+
self.conv3 = ConvNormLayer(hidden_channels, out_channels, 1, 1, bias=bias, act=act)
|
268 |
+
else:
|
269 |
+
self.conv3 = nn.Identity()
|
270 |
+
|
271 |
+
def forward(self, x):
|
272 |
+
x_1 = self.conv1(x)
|
273 |
+
x_1 = self.bottlenecks(x_1)
|
274 |
+
x_2 = self.conv2(x)
|
275 |
+
return self.conv3(x_1 + x_2)
|
276 |
+
|
277 |
+
|
278 |
+
# transformer
|
279 |
+
class TransformerEncoderLayer(nn.Module):
|
280 |
+
def __init__(self,
|
281 |
+
d_model,
|
282 |
+
nhead,
|
283 |
+
dim_feedforward=2048,
|
284 |
+
dropout=0.1,
|
285 |
+
activation="relu",
|
286 |
+
normalize_before=False):
|
287 |
+
super().__init__()
|
288 |
+
self.normalize_before = normalize_before
|
289 |
+
|
290 |
+
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True)
|
291 |
+
|
292 |
+
self.linear1 = nn.Linear(d_model, dim_feedforward)
|
293 |
+
self.dropout = nn.Dropout(dropout)
|
294 |
+
self.linear2 = nn.Linear(dim_feedforward, d_model)
|
295 |
+
|
296 |
+
self.norm1 = nn.LayerNorm(d_model)
|
297 |
+
self.norm2 = nn.LayerNorm(d_model)
|
298 |
+
self.dropout1 = nn.Dropout(dropout)
|
299 |
+
self.dropout2 = nn.Dropout(dropout)
|
300 |
+
self.activation = get_activation(activation)
|
301 |
+
|
302 |
+
@staticmethod
|
303 |
+
def with_pos_embed(tensor, pos_embed):
|
304 |
+
return tensor if pos_embed is None else tensor + pos_embed
|
305 |
+
|
306 |
+
def forward(self,
|
307 |
+
src,
|
308 |
+
src_mask=None,
|
309 |
+
src_key_padding_mask=None,
|
310 |
+
pos_embed=None) -> torch.Tensor:
|
311 |
+
residual = src
|
312 |
+
if self.normalize_before:
|
313 |
+
src = self.norm1(src)
|
314 |
+
q = k = self.with_pos_embed(src, pos_embed)
|
315 |
+
src, _ = self.self_attn(q, k,
|
316 |
+
value=src,
|
317 |
+
attn_mask=src_mask,
|
318 |
+
key_padding_mask=src_key_padding_mask)
|
319 |
+
|
320 |
+
src = residual + self.dropout1(src)
|
321 |
+
if not self.normalize_before:
|
322 |
+
src = self.norm1(src)
|
323 |
+
|
324 |
+
residual = src
|
325 |
+
if self.normalize_before:
|
326 |
+
src = self.norm2(src)
|
327 |
+
src = self.linear2(self.dropout(self.activation(self.linear1(src))))
|
328 |
+
src = residual + self.dropout2(src)
|
329 |
+
if not self.normalize_before:
|
330 |
+
src = self.norm2(src)
|
331 |
+
return src
|
332 |
+
|
333 |
+
|
334 |
+
class TransformerEncoder(nn.Module):
|
335 |
+
def __init__(self, encoder_layer, num_layers, norm=None):
|
336 |
+
super(TransformerEncoder, self).__init__()
|
337 |
+
self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)])
|
338 |
+
self.num_layers = num_layers
|
339 |
+
        self.norm = norm

    def forward(self,
                src,
                src_mask=None,
                src_key_padding_mask=None,
                pos_embed=None) -> torch.Tensor:
        output = src
        for layer in self.layers:
            output = layer(output,
                           src_mask=src_mask,
                           src_key_padding_mask=src_key_padding_mask,
                           pos_embed=pos_embed)

        if self.norm is not None:
            output = self.norm(output)

        return output


class HybridEncoderAsymConv(nn.Module):
    def __init__(
        self,
        in_channels=[512, 1024, 2048],
        feat_strides=[8, 16, 32],
        n_levels=3,
        hidden_dim=256,
        nhead=8,
        dim_feedforward=1024,
        dropout=0.0,
        enc_act='gelu',
        use_encoder_idx=[2],
        num_encoder_layers=1,
        expansion=1.0,
        depth_mult=1.0,
        act='silu',
        eval_spatial_size=None,
        # position embedding
        temperatureH=20,
        temperatureW=20,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.n_levels = n_levels
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.eval_spatial_size = eval_spatial_size

        self.temperatureW = temperatureW
        self.temperatureH = temperatureH

        # channel projection
        input_proj_list = []
        for in_channel in in_channels:
            input_proj_list.append(
                nn.Sequential(
                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                    nn.GroupNorm(32, hidden_dim)
                )
            )
        self.input_proj = nn.ModuleList(input_proj_list)

        # encoder transformer
        encoder_layer = TransformerEncoderLayer(
            hidden_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=enc_act)

        self.encoder = nn.ModuleList([
            TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx))
        ])

        # top-down fpn
        self.lateral_convs = nn.ModuleList()
        self.fpn_blocks = nn.ModuleList()
        for _ in range(n_levels - 1, 0, -1):
            self.lateral_convs.append(ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act))
            self.fpn_blocks.append(
                RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult))
                # CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
            )

        # bottom-up pan
        self.downsample_convs = nn.ModuleList()
        self.pan_blocks = nn.ModuleList()
        for _ in range(n_levels - 1):
            self.downsample_convs.append(nn.Sequential(
                SCDown(hidden_dim, hidden_dim, 3, 2),
                )
            )
            self.pan_blocks.append(
                RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(expansion * hidden_dim // 2), round(3 * depth_mult))
                # CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
            )

        self._reset_parameters()

    def _reset_parameters(self):
        # init input_proj
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            # nn.init.constant_(proj[0].bias, 0)

        if self.eval_spatial_size is not None:
            for idx in self.use_encoder_idx:
                stride = self.feat_strides[idx]
                pos_embed = self.create_sinehw_position_embedding(
                    self.eval_spatial_size[1] // stride,
                    self.eval_spatial_size[0] // stride,
                    self.hidden_dim // 2
                )
                setattr(self, f'pos_embed{idx}', pos_embed)

    def create_sinehw_position_embedding(self, w, h, hidden_dim, scale=None, device='cpu'):
        """Build a sine/cosine position embedding of shape [1, h*w, 2*hidden_dim]."""
        grid_w = torch.arange(1, int(w)+1, dtype=torch.float32, device=device)
        grid_h = torch.arange(1, int(h)+1, dtype=torch.float32, device=device)

        grid_h, grid_w = torch.meshgrid(grid_h, grid_w, indexing='ij')

        if scale is None:
            scale = 2 * math.pi

        eps = 1e-6
        grid_w = grid_w / (int(w) + eps) * scale
        grid_h = grid_h / (int(h) + eps) * scale

        dim_tx = torch.arange(hidden_dim, dtype=torch.float32, device=device)
        dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / hidden_dim)
        pos_x = grid_w[..., None] / dim_tx

        dim_ty = torch.arange(hidden_dim, dtype=torch.float32, device=device)
        dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / hidden_dim)
        pos_y = grid_h[..., None] / dim_ty

        pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
        pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)

        pos = torch.cat((pos_y, pos_x), dim=2).permute(2, 0, 1)
        pos = pos[None].flatten(2).permute(0, 2, 1).contiguous()

        return pos

    def forward(self, feats):
        """
        Input:
            - feats: List of features from the backbone
        Outputs:
            - output: List of enhanced features
        """
        assert len(feats) == len(self.in_channels)
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]

        # encoder
        for i, enc_idx in enumerate(self.use_encoder_idx):
            N_, C_, H_, W_ = proj_feats[enc_idx].shape
            # flatten [B, C, H, W] to [B, HxW, C]
            src_flatten = proj_feats[enc_idx].flatten(2).permute(0, 2, 1)
            if self.training or self.eval_spatial_size is None:
                pos_embed = self.create_sinehw_position_embedding(
                    H_, W_, self.hidden_dim//2, device=src_flatten.device)
            else:
                pos_embed = getattr(self, f'pos_embed{enc_idx}', None).to(src_flatten.device)

            proj_feats[enc_idx] = self.encoder[i](
                src_flatten,
                pos_embed=pos_embed
            ).permute(0, 2, 1).reshape(N_, C_, H_, W_).contiguous()

        # broadcasting and fusion
        inner_outs = [proj_feats[-1]]
        for idx in range(self.n_levels - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = proj_feats[idx - 1]
            feat_high = self.lateral_convs[self.n_levels - 1 - idx](feat_high)
            inner_outs[0] = feat_high
            upsample_feat = F.interpolate(feat_high, scale_factor=2., mode='nearest')
            inner_out = self.fpn_blocks[self.n_levels-1-idx](torch.concat([upsample_feat, feat_low], dim=1))
            inner_outs.insert(0, inner_out)

        outs = [inner_outs[0]]
        for idx in range(self.n_levels - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_convs[idx](feat_low)
            out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_high], dim=1))
            outs.append(out)
        return outs


def build_hybrid_encoder_with_asymmetric_conv(args):
    return HybridEncoderAsymConv(
        in_channels=args.in_channels_encoder,
        feat_strides=args.feat_strides,
        n_levels=args.num_feature_levels,
        hidden_dim=args.hidden_dim,
        nhead=args.nheads,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        enc_act='gelu',
        expansion=args.expansion,
        depth_mult=args.depth_mult,
        act='silu',
        temperatureH=args.pe_temperatureH,
        temperatureW=args.pe_temperatureW,
        eval_spatial_size=args.eval_spatial_size,
    )
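
A minimal shape-check sketch for the encoder above. It assumes hybrid_encoder_asymmetric_conv.py and its helper blocks (ConvNormLayer, RepNCSPELAN4, SCDown, TransformerEncoderLayer) import cleanly from this upload; the 640x640 input and strides 8/16/32 follow the defaults, and the printed shapes are the expected result, not verified output.

    # sketch: feed dummy backbone features at strides 8/16/32 for a 640x640 image
    import torch
    from linea.models.linea.hybrid_encoder_asymmetric_conv import HybridEncoderAsymConv

    enc = HybridEncoderAsymConv(in_channels=[512, 1024, 2048], eval_spatial_size=(640, 640)).eval()
    feats = [torch.randn(1, 512, 80, 80),    # stride 8
             torch.randn(1, 1024, 40, 40),   # stride 16
             torch.randn(1, 2048, 20, 20)]   # stride 32
    with torch.no_grad():
        outs = enc(feats)
    print([o.shape for o in outs])  # expected: three [1, 256, H, W] maps at the same three resolutions
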
linea/models/linea/linea.py
ADDED
@@ -0,0 +1,156 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Conditional DETR model and criterion classes.
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------
import copy
import math
from typing import List
import torch
from torch import nn
from torchvision.transforms.functional import resize

import numpy as np

from .utils import sigmoid_focal_loss, MLP

from ..registry import MODULE_BUILD_FUNCS

from .hgnetv2 import build_hgnetv2
from .hybrid_encoder_asymmetric_conv import build_hybrid_encoder_with_asymmetric_conv
from .decoder import build_decoder

from .linea_utils import *

class LINEA(nn.Module):
    """ This is the Cross-Attention Detector module that performs object detection """
    def __init__(self,
                 backbone,
                 encoder,
                 decoder,
                 # multiscale = None,
                 use_lmap=False
                 ):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, i.e. detection slots. This is the maximal number of objects
                         Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
        """
        super().__init__()
        self.backbone = backbone
        self.encoder = encoder
        self.decoder = decoder

        # for auxiliary branch
        if use_lmap:
            self.aux_branch = nn.ModuleList()
            hidden_dim = encoder.hidden_dim
            for i in range(3):
                n = 2 ** i
                self.aux_branch.append(nn.Conv2d(hidden_dim, 1, 1))

    def forward(self, samples, targets:List=None):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            It returns a dict with the following elements:
               - "pred_logits": the classification logits (including no-object) for all queries.
                                Shape= [batch_size x num_queries x num_classes]
               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                               (center_x, center_y, width, height). These values are normalized in [0, 1],
                               relative to the size of each individual image (disregarding possible padding).
                               See PostProcess for information on how to retrieve the unnormalized bounding box.
               - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                                dictionaries containing the two above keys for each decoder layer.
        """
        features = self.backbone(samples)

        features = self.encoder(features)

        out = self.decoder(features, targets)

        if self.training and hasattr(self, 'aux_branch'):
            lmaps = []
            for feat, convs in zip(features, self.aux_branch):
                lmap = convs(feat)
                lmaps.append(lmap)
            # lmaps = torch.cat(lmaps, dim=1)
            out['aux_lmap'] = lmaps

        return out

    def deploy(self, ):
        self.eval()
        for m in self.modules():
            if hasattr(m, 'convert_to_deploy'):
                m.convert_to_deploy()
        return self


class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""
    def __init__(self) -> None:
        super().__init__()
        self.deploy_mode = False

    @torch.no_grad()
    def forward(self, outputs, target_sizes):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each image of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        """
        out_logits, out_line = outputs['pred_logits'], outputs['pred_lines']

        scores = out_logits[..., 0].sigmoid()

        # convert to [x0, y0, x1, y1] format
        lines = out_line * target_sizes.repeat(1, 2).unsqueeze(1)

        if self.deploy_mode:
            return lines, scores

        results = [{'lines': l, 'scores': s} for s, l in zip(scores, lines)]

        return results

    def deploy(self, ):
        self.eval()
        self.deploy_mode = True
        return self

@MODULE_BUILD_FUNCS.registe_with_name(module_name='LINEA')
def build_linea(args):
    num_classes = args.num_classes

    backbone = build_hgnetv2(args)
    encoder = build_hybrid_encoder_with_asymmetric_conv(args)
    decoder = build_decoder(args)

    model = LINEA(
        backbone,
        encoder,
        decoder,
        use_lmap=args.use_lmap
    )

    postprocessors = PostProcess()

    return model, postprocessors
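
A hedged end-to-end sketch of build_linea and PostProcess. The SLConfig call, the config path, and the (width, height) convention for target_sizes are assumptions drawn from the rest of this upload, not verified here; the real entry point may prepare args differently.

    # sketch: build the model from one of the uploaded configs and post-process one image
    import torch
    from linea.util.slconfig import SLConfig
    from linea.models.linea.linea import build_linea

    args = SLConfig.fromfile('linea/configs/linea/linea_hgnetv2_l.py')  # assumed config path
    model, postprocessor = build_linea(args)
    model.eval()
    with torch.no_grad():
        outputs = model(torch.randn(1, 3, 640, 640))
    orig_sizes = torch.tensor([[640.0, 640.0]])          # per-image size; check the dataset's (w, h) order
    results = postprocessor(outputs, orig_sizes)
    print(results[0]['lines'].shape, results[0]['scores'].shape)
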
linea/models/linea/linea_utils.py
ADDED
@@ -0,0 +1,165 @@
import torch


def weighting_function(reg_max, up, reg_scale, deploy=False):
    """
    Generates the non-uniform Weighting Function W(n) for bounding box regression.

    Args:
        reg_max (int): Max number of the discrete bins.
        up (Tensor): Controls upper bounds of the sequence,
                     where maximum offset is ±up * H / W.
        reg_scale (float): Controls the curvature of the Weighting Function.
                           Larger values result in flatter weights near the central axis W(reg_max/2)=0
                           and steeper weights at both ends.
        deploy (bool): If True, uses deployment mode settings.

    Returns:
        Tensor: Sequence of Weighting Function.
    """
    if deploy:
        upper_bound1 = (abs(up[0]) * abs(reg_scale)).item()
        upper_bound2 = (abs(up[0]) * abs(reg_scale) * 2).item()
        step = (upper_bound1 + 1) ** (2 / (reg_max - 2))
        left_values = [-(step) ** i + 1 for i in range(reg_max // 2 - 1, 0, -1)]
        right_values = [(step) ** i - 1 for i in range(1, reg_max // 2)]
        values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2]
        return torch.tensor(values, dtype=up.dtype, device=up.device)
    else:
        upper_bound1 = abs(up[0]) * abs(reg_scale)
        upper_bound2 = abs(up[0]) * abs(reg_scale) * 2
        step = (upper_bound1 + 1) ** (2 / (reg_max - 2))
        left_values = [-(step) ** i + 1 for i in range(reg_max // 2 - 1, 0, -1)]
        right_values = [(step) ** i - 1 for i in range(1, reg_max // 2)]
        values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2]
        return torch.cat(values, 0)


def translate_gt(gt, reg_max, reg_scale, up):
    """
    Decodes bounding box ground truth (GT) values into distribution-based GT representations.

    This function maps continuous GT values into discrete distribution bins, which can be used
    for regression tasks in object detection models. It calculates the indices of the closest
    bins to each GT value and assigns interpolation weights to these bins based on their proximity
    to the GT value.

    Args:
        gt (Tensor): Ground truth bounding box values, shape (N, ).
        reg_max (int): Maximum number of discrete bins for the distribution.
        reg_scale (float): Controls the curvature of the Weighting Function.
        up (Tensor): Controls the upper bounds of the Weighting Function.

    Returns:
        Tuple[Tensor, Tensor, Tensor]:
            - indices (Tensor): Index of the left bin closest to each GT value, shape (N, ).
            - weight_right (Tensor): Weight assigned to the right bin, shape (N, ).
            - weight_left (Tensor): Weight assigned to the left bin, shape (N, ).
    """
    gt = gt.reshape(-1)
    function_values = weighting_function(reg_max, up, reg_scale)

    # Find the closest left-side indices for each value
    diffs = function_values.unsqueeze(0) - gt.unsqueeze(1)
    mask = diffs <= 0
    closest_left_indices = torch.sum(mask, dim=1) - 1

    # Calculate the weights for the interpolation
    indices = closest_left_indices.float()

    weight_right = torch.zeros_like(indices)
    weight_left = torch.zeros_like(indices)

    valid_idx_mask = (indices >= 0) & (indices < reg_max)
    valid_indices = indices[valid_idx_mask].long()

    # Obtain distances
    left_values = function_values[valid_indices]
    right_values = function_values[valid_indices + 1]

    left_diffs = torch.abs(gt[valid_idx_mask] - left_values)
    right_diffs = torch.abs(right_values - gt[valid_idx_mask])

    # Valid weights
    weight_right[valid_idx_mask] = left_diffs / (left_diffs + right_diffs)
    weight_left[valid_idx_mask] = 1.0 - weight_right[valid_idx_mask]

    # Invalid weights (out of range)
    invalid_idx_mask_neg = (indices < 0)
    weight_right[invalid_idx_mask_neg] = 0.0
    weight_left[invalid_idx_mask_neg] = 1.0
    indices[invalid_idx_mask_neg] = 0.0

    invalid_idx_mask_pos = (indices >= reg_max)
    weight_right[invalid_idx_mask_pos] = 1.0
    weight_left[invalid_idx_mask_pos] = 0.0
    indices[invalid_idx_mask_pos] = reg_max - 0.1

    return indices, weight_right, weight_left


def bbox2distance(points, bbox, reg_max, reg_scale, up, eps=0.1):
    """
    Converts bounding box coordinates to distances from a reference point.

    Args:
        points (Tensor): (n, 4) [x, y, w, h], where (x, y) is the center.
        bbox (Tensor): (n, 4) bounding boxes in "xyxy" format.
        reg_max (float): Maximum bin value.
        reg_scale (float): Controls the curvature of W(n).
        up (Tensor): Controls the upper bounds of W(n).
        eps (float): Small value to ensure target < reg_max.

    Returns:
        Tensor: Decoded distances.
    """
    reg_scale = abs(reg_scale)

    Dx = torch.abs(points[..., 0] - points[..., 2])
    Dy = torch.abs(points[..., 1] - points[..., 3])

    left = (points[:, 0] - bbox[:, 0]) / (Dx / reg_scale + 1e-16) - 0.5 * reg_scale
    top = (points[:, 1] - bbox[:, 1]) / (Dy / reg_scale + 1e-16) - 0.5 * reg_scale
    right = (points[:, 2] - bbox[:, 2]) / (Dx / reg_scale + 1e-16) - 0.5 * reg_scale
    bottom = (points[:, 3] - bbox[:, 3]) / (Dy / reg_scale + 1e-16) - 0.5 * reg_scale
    four_lens = torch.stack([left, top, right, bottom], -1)
    four_lens, weight_right, weight_left = translate_gt(four_lens, reg_max, reg_scale, up)
    if reg_max is not None:
        four_lens = four_lens.clamp(min=0, max=reg_max-eps)
    return four_lens.reshape(-1).detach(), weight_right.detach(), weight_left.detach()


def distance2bbox(points, distance, reg_scale):
    """
    Decodes edge-distances into bounding box coordinates.

    Args:
        points (Tensor): (B, N, 4) or (N, 4) format, representing [x, y, w, h],
                         where (x, y) is the center and (w, h) are width and height.
        distance (Tensor): (B, N, 4) or (N, 4), representing distances from the
                           point to the left, top, right, and bottom boundaries.

        reg_scale (float): Controls the curvature of the Weighting Function.

    Returns:
        Tensor: Bounding boxes in (N, 4) or (B, N, 4) format [cx, cy, w, h].
    """
    reg_scale = abs(reg_scale)

    Dx = torch.abs(points[..., 0] - points[..., 2])
    Dy = torch.abs(points[..., 1] - points[..., 3])

    x1 = points[..., 0] + (0.5 * reg_scale + distance[..., 0]) * (Dx / reg_scale)
    y1 = points[..., 1] + (0.5 * reg_scale + distance[..., 1]) * (Dy / reg_scale)
    x2 = points[..., 2] + (0.5 * reg_scale + distance[..., 2]) * (Dx / reg_scale)
    y2 = points[..., 3] + (0.5 * reg_scale + distance[..., 3]) * (Dy / reg_scale)

    bboxes = torch.stack([x1, y1, x2, y2], -1)

    return bboxes

def inverse_sigmoid(x, eps=1e-3):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1/x2)
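
A quick numerical check of the inverse_sigmoid helper above, which the denoising code uses to move normalized line coordinates into logit space; the values are illustrative only.

    # sketch: sigmoid(inverse_sigmoid(x)) recovers x, up to the eps clamping at the boundaries
    import torch
    from linea.models.linea.linea_utils import inverse_sigmoid

    x = torch.tensor([0.0, 0.25, 0.5, 0.99])
    print(torch.sigmoid(inverse_sigmoid(x)))  # ~[1e-3, 0.25, 0.5, 0.99]
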
linea/models/linea/matcher.py
ADDED
@@ -0,0 +1,180 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modules to compute the matching cost and solve the corresponding LSAP.
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------


import torch, os
from torch import nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network
    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, focal_alpha = 0.25):
        """Creates the matcher
        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_line = cost_bbox
        assert cost_class != 0 or cost_bbox != 0, "all costs can't be 0"

        self.focal_alpha = focal_alpha

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching
        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """

        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
        out_line = outputs["pred_lines"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and lines
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_line = torch.cat([v["lines"] for v in targets])

        # Compute the classification cost.
        alpha = self.focal_alpha
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_line = torch.cdist(out_line, tgt_line, p=1)

        # Final cost matrix
        C = self.cost_line * cost_line + self.cost_class * cost_class
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["lines"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]


class SimpleMinsumMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network
    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, focal_alpha = 0.25):
        """Creates the matcher
        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_line = cost_bbox
        assert cost_class != 0 or cost_bbox != 0, "all costs can't be 0"

        self.focal_alpha = focal_alpha

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching
        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """

        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
        out_line = outputs["pred_lines"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_line = torch.cat([v["lines"] for v in targets])

        # Compute the classification cost.
        alpha = self.focal_alpha
        gamma = 2.0
        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_line = torch.cdist(out_line, tgt_line, p=1)

        # Final cost matrix
        C = self.cost_line * cost_line + self.cost_class * cost_class
        C = C.view(bs, num_queries, -1)

        sizes = [len(v["lines"]) for v in targets]
        indices = []
        device = C.device
        for i, (c, _size) in enumerate(zip(C.split(sizes, -1), sizes)):
            weight_mat = c[i]
            idx_i = weight_mat.min(0)[1]
            idx_j = torch.arange(_size).to(device)
            indices.append((idx_i, idx_j))

        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]


def build_matcher(args):
    assert args.matcher_type in ['HungarianMatcher', 'SimpleMinsumMatcher'], "Unknown args.matcher_type: {}".format(args.matcher_type)
    if args.matcher_type == 'HungarianMatcher':
        return HungarianMatcher(
            cost_class=args.set_cost_class, cost_bbox=args.set_cost_lines, focal_alpha=args.focal_alpha
        )
    elif args.matcher_type == 'SimpleMinsumMatcher':
        return SimpleMinsumMatcher(
            cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, focal_alpha=args.focal_alpha
        )
    else:
        raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type))
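
A self-contained sketch of the matcher's input/output contract, using random predictions and two dummy targets; the cost weights are illustrative, not the values used in the uploaded configs.

    # sketch: one (pred_idx, tgt_idx) pair per image; len(pred_idx) == number of GT lines in that image
    import torch
    from linea.models.linea.matcher import HungarianMatcher

    matcher = HungarianMatcher(cost_class=2.0, cost_bbox=5.0, focal_alpha=0.25)
    outputs = {'pred_logits': torch.randn(2, 10, 2), 'pred_lines': torch.rand(2, 10, 4)}
    targets = [
        {'labels': torch.zeros(3, dtype=torch.int64), 'lines': torch.rand(3, 4)},
        {'labels': torch.zeros(1, dtype=torch.int64), 'lines': torch.rand(1, 4)},
    ]
    indices = matcher(outputs, targets)
    print([(i.tolist(), j.tolist()) for i, j in indices])
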
linea/models/linea/new_dn_components.py
ADDED
@@ -0,0 +1,163 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# DN-DETR
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]


import torch
from .linea_utils import inverse_sigmoid
import torch.nn.functional as F

def prepare_for_cdn(dn_args, training, num_queries, num_classes, hidden_dim, label_enc):
    """
    A major difference of DINO from DN-DETR is that the authors process the pattern embedding in the
    detector's forward function and use a learnable tgt embedding, so we change this function a little bit.
    :param dn_args: targets, dn_number, label_noise_ratio, box_noise_scale
    :param training: if it is training or inference
    :param num_queries: number of queries
    :param num_classes: number of classes
    :param hidden_dim: transformer hidden dim
    :param label_enc: encode labels in dn
    :return:
    """
    if training:
        targets, dn_number, label_noise_ratio, box_noise_scale = dn_args
        # positive and negative dn queries
        dn_number = dn_number * 2
        known = [(torch.ones_like(t['labels'])).cuda() for t in targets]
        batch_size = len(known)
        known_num = [sum(k) for k in known]

        if int(max(known_num)) == 0:
            dn_number = 1
        else:
            if dn_number >= 100:
                dn_number = dn_number // (int(max(known_num) * 2))
            elif dn_number < 1:
                dn_number = 1
        if dn_number == 0:
            dn_number = 1

        unmask_bbox = unmask_label = torch.cat(known)
        labels = torch.cat([t['labels'] for t in targets])
        lines = torch.cat([t['lines'] for t in targets])
        batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])

        known_indice = torch.nonzero(unmask_label + unmask_bbox)
        known_indice = known_indice.view(-1)

        known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
        known_labels = labels.repeat(2 * dn_number, 1).view(-1)
        known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)
        known_lines = lines.repeat(2 * dn_number, 1)

        known_labels_expaned = known_labels.clone()
        known_lines_expand = known_lines.clone()

        if label_noise_ratio > 0:
            p = torch.rand_like(known_labels_expaned.float())
            chosen_indice = torch.nonzero(p < (label_noise_ratio * 0.5)).view(-1)  # half of bbox prob
            new_label = torch.randint_like(chosen_indice, 0, num_classes)  # randomly put a new one here
            known_labels_expaned.scatter_(0, chosen_indice, new_label)

        single_pad = int(max(known_num))

        pad_size = int(single_pad * 2 * dn_number)
        positive_idx = torch.tensor(range(len(lines))).long().cuda().unsqueeze(0).repeat(dn_number, 1)
        positive_idx += (torch.tensor(range(dn_number)) * len(lines) * 2).long().cuda().unsqueeze(1)
        positive_idx = positive_idx.flatten()
        negative_idx = positive_idx + len(lines)

        known_lines_ = known_lines.clone().unflatten(-1, (2, 2))
        offsets = F.normalize(2 * torch.rand_like(known_lines_) - 1, dim=-1)
        rand_part = torch.rand(size=(known_lines.shape[0], 2, 1), device=known_lines.device, dtype=offsets.dtype)

        rand_part[positive_idx] *= 0.005
        rand_part[negative_idx] *= 0.0645
        rand_part[negative_idx] += 0.0055

        known_lines_ = known_lines_ + offsets * rand_part
        known_lines_ = known_lines_.flatten(-2)

        known_lines_expand = known_lines_.clamp(min=0.0, max=1.0)

        # # order: top point > bottom point
        # # if same y coordinate, right point > left point

        # idx = torch.logical_or(known_lines_expand[..., 0] > known_lines_expand[..., 2],
        #                        torch.logical_or(
        #                            known_lines_expand[..., 0] == known_lines_expand[..., 2],
        #                            known_lines_expand[..., 1] < known_lines_expand[..., 3]
        #                        )
        #                        )

        # known_lines_expand[idx] = known_lines_expand[idx][:, [2, 3, 0, 1]]

        m = known_labels_expaned.long().to('cuda')
        input_label_embed = label_enc(m)
        input_lines_embed = inverse_sigmoid(known_lines_expand)

        padding_label = torch.zeros(pad_size, hidden_dim).cuda()
        padding_lines = torch.zeros(pad_size, 4).cuda()

        input_query_label = padding_label.repeat(batch_size, 1, 1)
        input_query_lines = padding_lines.repeat(batch_size, 1, 1)

        map_known_indice = torch.tensor([]).to('cuda')
        if len(known_num):
            map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num])  # [1,2, 1,2,3]
            map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long()

        if len(known_bid):
            input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed
            input_query_lines[(known_bid.long(), map_known_indice)] = input_lines_embed

        tgt_size = pad_size + num_queries
        attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
        # match query cannot see the reconstruct
        attn_mask[pad_size:, :pad_size] = True
        # reconstruct cannot see each other
        for i in range(dn_number):
            if i == 0:
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
            if i == dn_number - 1:
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True
            else:
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
                attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True

        dn_meta = {
            'pad_size': pad_size,
            'num_dn_group': dn_number,
        }
    else:

        input_query_label = None
        input_query_lines = None
        attn_mask = None
        dn_meta = None

    return input_query_label, input_query_lines, attn_mask, dn_meta


def dn_post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
    """
    post process of dn after output from the transformer
    put the dn part in the dn_meta
    """
    if dn_meta and dn_meta['pad_size'] > 0:
        output_known_class = outputs_class[:, :, :dn_meta['pad_size'], :]
        output_known_coord = outputs_coord[:, :, :dn_meta['pad_size'], :]
        outputs_class = outputs_class[:, :, dn_meta['pad_size']:, :]
        outputs_coord = outputs_coord[:, :, dn_meta['pad_size']:, :]
        out = {'pred_logits': output_known_class[-1], 'pred_lines': output_known_coord[-1]}
        if aux_loss:
            out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_coord)
        dn_meta['output_known_lbs_lines'] = out
    return outputs_class, outputs_coord
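
A hedged sketch of the prepare_for_cdn contract: the helper calls .cuda() internally, so this only runs on a GPU machine, and label_enc here is a plain nn.Embedding standing in for whatever the decoder actually passes; dn_number, noise ratios, and target sizes are illustrative.

    # sketch: one image with 3 GT lines; the returned tensors pad noised (positive/negative) query groups
    import torch
    from torch import nn
    from linea.models.linea.new_dn_components import prepare_for_cdn

    num_queries, num_classes, hidden_dim = 100, 2, 256
    label_enc = nn.Embedding(num_classes, hidden_dim).cuda()
    targets = [{'labels': torch.zeros(3, dtype=torch.int64).cuda(), 'lines': torch.rand(3, 4).cuda()}]
    dn_args = (targets, 100, 0.5, 1.0)  # targets, dn_number, label_noise_ratio, box_noise_scale
    q_label, q_lines, attn_mask, dn_meta = prepare_for_cdn(
        dn_args, True, num_queries, num_classes, hidden_dim, label_enc)
    # q_label: [bs, pad_size, hidden_dim]; q_lines: [bs, pad_size, 4] in inverse-sigmoid space;
    # attn_mask keeps matching queries from attending to the noised groups
    print(q_label.shape, q_lines.shape, attn_mask.shape, dn_meta['pad_size'])
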
linea/models/linea/position_encoding.py
ADDED
@@ -0,0 +1,150 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Conditional DETR
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

"""
Various positional encodings for the transformer.
"""
import math
import torch
from torch import nn

from util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

class PositionEmbeddingSineHW(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperatureH = temperatureH
        self.temperatureW = temperatureW
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)

        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)
        pos_x = x_embed[:, :, :, None] / dim_tx

        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)
        pos_y = y_embed[:, :, :, None] / dim_ty

        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
    if args.position_embedding in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSineHW(
            N_steps,
            temperatureH=args.pe_temperatureH,
            temperatureW=args.pe_temperatureW,
            normalize=True
        )
    elif args.position_embedding in ('v3', 'learned'):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding
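
A short sketch of the HW sine embedding's output shape, assuming util.misc.NestedTensor is importable the same way position_encoding.py imports it (a tensors + mask pair); the 20x20 feature map and zero mask are illustrative.

    # sketch: 128 y-features concatenated with 128 x-features -> 256 channels
    import torch
    from util.misc import NestedTensor
    from linea.models.linea.position_encoding import PositionEmbeddingSineHW

    pe = PositionEmbeddingSineHW(num_pos_feats=128, temperatureH=20, temperatureW=20, normalize=True)
    x = torch.randn(1, 256, 20, 20)
    mask = torch.zeros(1, 20, 20, dtype=torch.bool)  # no padded pixels
    pos = pe(NestedTensor(x, mask))
    print(pos.shape)  # expected [1, 256, 20, 20]
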
linea/models/linea/utils.py
ADDED
@@ -0,0 +1,139 @@
# ------------------------------------------------------------------------
# DINO
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

import torch
from torch import nn, Tensor

import math
import torch.nn.functional as F
from torch import nn


def gen_encoder_output_proposals(memory:Tensor, spatial_shapes:Tensor):
    """
    Input:
        - memory: bs, \sum{hw}, d_model
        - memory_padding_mask: bs, \sum{hw}
        - spatial_shapes: nlevel, 2
        - learnedwh: 2
    Output:
        - output_memory: bs, \sum{hw}, d_model
        - output_proposals: bs, \sum{hw}, 4
    """
    N_, S_, C_ = memory.shape
    base_scale = 4.0
    proposals = []

    for lvl, (H_, W_) in enumerate(spatial_shapes):

        grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
                                        torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2

        scale = torch.tensor([W_, H_], dtype=torch.float32, device=memory.device).view(1, 1, 1, 2).repeat(N_, 1, 1, 1)
        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale

        # wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)

        # proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
        proposal = torch.cat((grid, grid), -1).view(N_, -1, 4)
        proposals.append(proposal)

    output_proposals = torch.cat(proposals, 1)
    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid
    output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

    output_memory = memory
    output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))

    return output_memory, output_proposals


def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                 (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
               positive vs negative examples. Default = -1 (no weighting).
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
    Returns:
        Loss tensor
    """
    prob = inputs.sigmoid()
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    return loss.mean(1).sum() / num_boxes


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


def _get_activation_fn(activation, d_model=256, batch_dim=0):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    if activation == "prelu":
        return nn.PReLU()
    if activation == "selu":
        return F.selu

    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")


def gen_sineembed_for_position(pos_tensor, hidden_dim):
    # n_query, bs, _ = pos_tensor.size()
    # sineembed_tensor = torch.zeros(n_query, bs, 256)
    hidden_dim_ = hidden_dim // 2
    scale = 2 * math.pi
    dim_t = torch.arange(hidden_dim_, dtype=torch.float32, device=pos_tensor.device)
    dim_t = 10000 ** (2 * (dim_t // 2) / hidden_dim_)
    x_embed = pos_tensor[:, :, 0] * scale
    y_embed = pos_tensor[:, :, 1] * scale
    pos_x = x_embed[:, :, None] / dim_t
    pos_y = y_embed[:, :, None] / dim_t
    pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
    pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)

    w_embed = pos_tensor[:, :, 2] * scale
    pos_w = w_embed[:, :, None] / dim_t
    pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)

    h_embed = pos_tensor[:, :, 3] * scale
    pos_h = h_embed[:, :, None] / dim_t
    pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)

    pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
    return pos
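
A tiny check of the MLP and sigmoid_focal_loss helpers above; shapes and the num_boxes normalizer are arbitrary.

    # sketch: MLP maps [..., 256] -> [..., 4]; focal loss reduces to a scalar
    import torch
    from linea.models.linea.utils import MLP, sigmoid_focal_loss

    mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
    print(mlp(torch.randn(2, 100, 256)).shape)  # [2, 100, 4]

    logits = torch.randn(2, 100, 2)
    targets = torch.zeros_like(logits)
    print(sigmoid_focal_loss(logits, targets, num_boxes=5))  # scalar, averaged over the class dim
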
linea/models/registry.py
ADDED
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# @Author: Yihao Chen
# @Date:   2021-08-16 16:03:17
# @Last Modified by:   Shilong Liu
# @Last Modified time: 2022-01-23 15:26
# modified from mmcv

import inspect
from functools import partial


class Registry(object):

    def __init__(self, name):
        self._name = name
        self._module_dict = dict()

    def __repr__(self):
        format_str = self.__class__.__name__ + '(name={}, items={})'.format(
            self._name, list(self._module_dict.keys()))
        return format_str

    def __len__(self):
        return len(self._module_dict)

    @property
    def name(self):
        return self._name

    @property
    def module_dict(self):
        return self._module_dict

    def get(self, key):
        return self._module_dict.get(key, None)

    def registe_with_name(self, module_name=None, force=False):
        return partial(self.register, module_name=module_name, force=force)

    def register(self, module_build_function, module_name=None, force=False):
        """Register a module build function.
        Args:
            module (:obj:`nn.Module`): Module to be registered.
        """
        if not inspect.isfunction(module_build_function):
            raise TypeError('module_build_function must be a function, but got {}'.format(
                type(module_build_function)))
        if module_name is None:
            module_name = module_build_function.__name__
        if not force and module_name in self._module_dict:
            raise KeyError('{} is already registered in {}'.format(
                module_name, self.name))
        self._module_dict[module_name] = module_build_function

        return module_build_function

MODULE_BUILD_FUNCS = Registry('model build functions')
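
A minimal sketch of how this registry is used elsewhere in the upload: build functions decorate themselves onto MODULE_BUILD_FUNCS (note the spelling registe_with_name) and are later looked up by name. The toy build function below is hypothetical; the real ones, like build_linea, return (model, postprocessors).

    # sketch: register a build function, then retrieve it by name
    from linea.models.registry import MODULE_BUILD_FUNCS

    @MODULE_BUILD_FUNCS.registe_with_name(module_name='toy_model')
    def build_toy_model(args):
        return object()  # placeholder

    build_fn = MODULE_BUILD_FUNCS.get('toy_model')
    model = build_fn(args=None)
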
linea/requirements.txt
ADDED
@@ -0,0 +1,9 @@
torch>=2.0.1
torchvision>=0.15.2
scipy
calflops
transformers
tensorboardx
addict
yapf
pycocotools
linea/util/__init__.py
ADDED
@@ -0,0 +1 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
linea/util/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (151 Bytes)
linea/util/__pycache__/misc.cpython-311.pyc
ADDED
Binary file (15.2 kB)
linea/util/__pycache__/slconfig.cpython-311.pyc
ADDED
Binary file (24.6 kB)
linea/util/get_param_dicts.py
ADDED
@@ -0,0 +1,35 @@
import json
import torch
import torch.nn as nn

import re


def get_optim_params(cfg: list, model: nn.Module):
    """
    E.g.:
        ^(?=.*a)(?=.*b).*$   means including a and b
        ^(?=.*(?:a|b)).*$    means including a or b
        ^(?=.*a)(?!.*b).*$   means including a, but not b
    """

    param_groups = []
    visited = []
    for pg in cfg:
        pattern = pg['params']
        params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0}
        pg['params'] = params.values()
        param_groups.append(pg)
        visited.extend(list(params.keys()))

    names = [k for k, v in model.named_parameters() if v.requires_grad]

    if len(visited) < len(names):
        unseen = set(names) - set(visited)
        params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen}
        param_groups.append({'params': params.values()})
        visited.extend(list(params.keys()))

    assert len(visited) == len(names), ''

    return param_groups
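A minimal usage sketch of get_optim_params; the model, regex pattern, and learning rates below are made up for illustration:

# Illustrative sketch: give backbone parameters a lower learning rate and let
# everything else fall through to the catch-all group appended at the end.
import torch
import torch.nn as nn
from linea.util.get_param_dicts import get_optim_params

model = nn.ModuleDict({'backbone': nn.Linear(4, 4), 'decoder': nn.Linear(4, 2)})

cfg = [
    {'params': '^(?=.*backbone).*$', 'lr': 1e-5},  # matches 'backbone.weight', 'backbone.bias'
]
param_groups = get_optim_params(cfg, model)        # decoder params land in the default group
optimizer = torch.optim.AdamW(param_groups, lr=1e-4, weight_decay=1e-4)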
linea/util/misc.py
ADDED
@@ -0,0 +1,275 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Misc functions, including distributed helpers.

Mostly copy-paste from torchvision references.
"""
import os
import time
from collections import defaultdict, deque
import datetime
from typing import Optional, List

import torch
import torch.distributed as dist
from torch import Tensor


class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        if d.shape[0] == 0:
            return 0
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)


def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict


class MetricLogger(object):
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            if meter.count > 0:
                loss_str.append(
                    "{}: {}".format(name, str(meter))
                )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None, logger=None):
        if logger is None:
            print_func = print
        else:
            print_func = logger.info

        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        if torch.cuda.is_available():
            log_msg = self.delimiter.join([
                header,
                '[{0' + space_fmt + '}/{1}]',
                'eta: {eta}',
                '{meters}',
                'time: {time}',
                'data: {data}',
                'max mem: {memory:.0f}'
            ])
        else:
            log_msg = self.delimiter.join([
                header,
                '[{0' + space_fmt + '}/{1}]',
                'eta: {eta}',
                '{meters}',
                'time: {time}',
                'data: {data}'
            ])
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj

            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print_func(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB))
                else:
                    print_func(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print_func('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / len(iterable)))


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)


def init_distributed_mode(args):
    try:
        # https://pytorch.org/docs/stable/elastic/run.html
        RANK = int(os.getenv('RANK', -1))
        args.gpu = LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))
        WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))

        torch.distributed.init_process_group(init_method='env://')
        torch.distributed.barrier()

        rank = torch.distributed.get_rank()
        torch.cuda.set_device(rank)
        torch.cuda.empty_cache()
        args.distributed = True
        setup_for_distributed(get_rank() == 0)
        print('Initialized distributed mode...')
    except:
        print('Not using distributed mode')
        args.distributed = False
        args.world_size = 1
        args.rank = 0
        args.local_rank = 0
        return
linea/util/profiler.py
ADDED
@@ -0,0 +1,21 @@
import copy
from calflops import calculate_flops
from typing import Tuple


def stats(model, args,
          input_shape: Tuple = (1, 3, 640, 640)) -> dict:

    base_size = args.eval_spatial_size[0]
    input_shape = (1, 3, base_size, base_size)

    model_for_info = copy.deepcopy(model).deploy()

    flops, macs, _ = calculate_flops(model=model_for_info,
                                     input_shape=input_shape,
                                     output_as_string=True,
                                     output_precision=4,
                                     print_detailed=False)
    params = sum(p.numel() for p in model_for_info.parameters())
    del model_for_info
    return {'flops': flops, 'macs': macs, 'params': params}
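Usage sketch, assuming a model that exposes .deploy() and an args/config object carrying eval_spatial_size, as the function requires:

# Illustrative call; the printed formatting is arbitrary.
info = stats(model, args)
print("FLOPs: {}, MACs: {}, params: {:.1f}M".format(
    info['flops'], info['macs'], info['params'] / 1e6))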
linea/util/slconfig.py
ADDED
@@ -0,0 +1,440 @@
# ==========================================================
# Modified from mmcv
# ==========================================================
import os, sys
import os.path as osp
import ast
import tempfile
import shutil
from importlib import import_module

from argparse import Action

from addict import Dict
from yapf.yapflib.yapf_api import FormatCode

import platform
MACOS, LINUX, WINDOWS = (platform.system() == x for x in ['Darwin', 'Linux', 'Windows'])  # environment booleans

BASE_KEY = '_base_'
DELETE_KEY = '_delete_'
RESERVED_KEYS = ['filename', 'text', 'pretty_text', 'get', 'dump', 'merge_from_dict']


def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
    if not osp.isfile(filename):
        raise FileNotFoundError(msg_tmpl.format(filename))


class ConfigDict(Dict):

    def __missing__(self, name):
        raise KeyError(name)

    def __getattr__(self, name):
        try:
            value = super(ConfigDict, self).__getattr__(name)
        except KeyError:
            ex = AttributeError(f"'{self.__class__.__name__}' object has no "
                                f"attribute '{name}'")
        except Exception as e:
            ex = e
        else:
            return value
        raise ex


class SLConfig(object):
    """
    config files.
    only support .py file as config now.

    ref: mmcv.utils.config

    Example:
        >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1])))
        >>> cfg.a
        1
        >>> cfg.b
        {'b1': [0, 1]}
        >>> cfg.b.b1
        [0, 1]
        >>> cfg = Config.fromfile('tests/data/config/a.py')
        >>> cfg.filename
        "/home/kchen/projects/mmcv/tests/data/config/a.py"
        >>> cfg.item4
        'test'
        >>> cfg
        "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: "
        "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}"
    """
    @staticmethod
    def _validate_py_syntax(filename):
        with open(filename) as f:
            content = f.read()
        try:
            ast.parse(content)
        except SyntaxError:
            raise SyntaxError('There are syntax errors in config '
                              f'file {filename}')

    @staticmethod
    def _file2dict(filename):
        filename = osp.abspath(osp.expanduser(filename))
        check_file_exist(filename)
        if filename.lower().endswith('.py'):
            with tempfile.TemporaryDirectory() as temp_config_dir:
                temp_config_file = tempfile.NamedTemporaryFile(
                    dir=temp_config_dir, suffix='.py')
                temp_config_name = osp.basename(temp_config_file.name)
                if WINDOWS:
                    temp_config_file.close()
                shutil.copyfile(filename,
                                osp.join(temp_config_dir, temp_config_name))
                temp_module_name = osp.splitext(temp_config_name)[0]
                sys.path.insert(0, temp_config_dir)
                SLConfig._validate_py_syntax(filename)
                mod = import_module(temp_module_name)
                sys.path.pop(0)
                cfg_dict = {
                    name: value
                    for name, value in mod.__dict__.items()
                    if not name.startswith('__')
                }
                # delete imported module
                del sys.modules[temp_module_name]
                # close temp file
                temp_config_file.close()
        elif filename.lower().endswith(('.yml', '.yaml', '.json')):
            from .slio import slload
            cfg_dict = slload(filename)
        else:
            raise IOError('Only py/yml/yaml/json type are supported now!')

        cfg_text = filename + '\n'
        with open(filename, 'r') as f:
            cfg_text += f.read()

        # parse the base file
        if BASE_KEY in cfg_dict:
            cfg_dir = osp.dirname(filename)
            base_filename = cfg_dict.pop(BASE_KEY)
            base_filename = base_filename if isinstance(
                base_filename, list) else [base_filename]

            cfg_dict_list = list()
            cfg_text_list = list()
            for f in base_filename:
                _cfg_dict, _cfg_text = SLConfig._file2dict(osp.join(cfg_dir, f))
                cfg_dict_list.append(_cfg_dict)
                cfg_text_list.append(_cfg_text)

            base_cfg_dict = dict()
            for c in cfg_dict_list:
                if len(base_cfg_dict.keys() & c.keys()) > 0:
                    raise KeyError('Duplicate key is not allowed among bases')
                    # TODO Allow the duplicate key while warning the user
                base_cfg_dict.update(c)

            base_cfg_dict = SLConfig._merge_a_into_b(cfg_dict, base_cfg_dict)
            cfg_dict = base_cfg_dict

            # merge cfg_text
            cfg_text_list.append(cfg_text)
            cfg_text = '\n'.join(cfg_text_list)

        return cfg_dict, cfg_text

    @staticmethod
    def _merge_a_into_b(a, b):
        """merge dict `a` into dict `b` (non-inplace).
        values in `a` will overwrite `b`.
        copy first to avoid inplace modification

        Args:
            a (dict): dict whose values take precedence.
            b (dict): base dict to merge into.

        Returns:
            dict: the merged dict.
        """

        if not isinstance(a, dict):
            return a

        b = b.copy()
        for k, v in a.items():
            if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False):

                if not isinstance(b[k], dict) and not isinstance(b[k], list):
                    raise TypeError(
                        f'{k}={v} in child config cannot inherit from base '
                        f'because {k} is a dict in the child config but is of '
                        f'type {type(b[k])} in base config. You may set '
                        f'`{DELETE_KEY}=True` to ignore the base config')
                b[k] = SLConfig._merge_a_into_b(v, b[k])
            elif isinstance(b, list):
                try:
                    _ = int(k)
                except:
                    raise TypeError(
                        f'b is a list, '
                        f'index {k} should be an int when input but {type(k)}'
                    )
                b[int(k)] = SLConfig._merge_a_into_b(v, b[int(k)])
            else:
                b[k] = v

        return b

    @staticmethod
    def fromfile(filename):
        cfg_dict, cfg_text = SLConfig._file2dict(filename)
        return SLConfig(cfg_dict, cfg_text=cfg_text, filename=filename)

    def __init__(self, cfg_dict=None, cfg_text=None, filename=None):
        if cfg_dict is None:
            cfg_dict = dict()
        elif not isinstance(cfg_dict, dict):
            raise TypeError('cfg_dict must be a dict, but '
                            f'got {type(cfg_dict)}')
        for key in cfg_dict:
            if key in RESERVED_KEYS:
                raise KeyError(f'{key} is reserved for config file')

        super(SLConfig, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict))
        super(SLConfig, self).__setattr__('_filename', filename)
        if cfg_text:
            text = cfg_text
        elif filename:
            with open(filename, 'r') as f:
                text = f.read()
        else:
            text = ''
        super(SLConfig, self).__setattr__('_text', text)

    @property
    def filename(self):
        return self._filename

    @property
    def text(self):
        return self._text

    @property
    def pretty_text(self):

        indent = 4

        def _indent(s_, num_spaces):
            s = s_.split('\n')
            if len(s) == 1:
                return s_
            first = s.pop(0)
            s = [(num_spaces * ' ') + line for line in s]
            s = '\n'.join(s)
            s = first + '\n' + s
            return s

        def _format_basic_types(k, v, use_mapping=False):
            if isinstance(v, str):
                v_str = f"'{v}'"
            else:
                v_str = str(v)

            if use_mapping:
                k_str = f"'{k}'" if isinstance(k, str) else str(k)
                attr_str = f'{k_str}: {v_str}'
            else:
                attr_str = f'{str(k)}={v_str}'
            attr_str = _indent(attr_str, indent)

            return attr_str

        def _format_list(k, v, use_mapping=False):
            # check if all items in the list are dict
            if all(isinstance(_, dict) for _ in v):
                v_str = '[\n'
                v_str += '\n'.join(
                    f'dict({_indent(_format_dict(v_), indent)}),'
                    for v_ in v).rstrip(',')
                if use_mapping:
                    k_str = f"'{k}'" if isinstance(k, str) else str(k)
                    attr_str = f'{k_str}: {v_str}'
                else:
                    attr_str = f'{str(k)}={v_str}'
                attr_str = _indent(attr_str, indent) + ']'
            else:
                attr_str = _format_basic_types(k, v, use_mapping)
            return attr_str

        def _contain_invalid_identifier(dict_str):
            contain_invalid_identifier = False
            for key_name in dict_str:
                contain_invalid_identifier |= \
                    (not str(key_name).isidentifier())
            return contain_invalid_identifier

        def _format_dict(input_dict, outest_level=False):
            r = ''
            s = []

            use_mapping = _contain_invalid_identifier(input_dict)
            if use_mapping:
                r += '{'
            for idx, (k, v) in enumerate(input_dict.items()):
                is_last = idx >= len(input_dict) - 1
                end = '' if outest_level or is_last else ','
                if isinstance(v, dict):
                    v_str = '\n' + _format_dict(v)
                    if use_mapping:
                        k_str = f"'{k}'" if isinstance(k, str) else str(k)
                        attr_str = f'{k_str}: dict({v_str}'
                    else:
                        attr_str = f'{str(k)}=dict({v_str}'
                    attr_str = _indent(attr_str, indent) + ')' + end
                elif isinstance(v, list):
                    attr_str = _format_list(k, v, use_mapping) + end
                else:
                    attr_str = _format_basic_types(k, v, use_mapping) + end

                s.append(attr_str)
            r += '\n'.join(s)
            if use_mapping:
                r += '}'
            return r

        cfg_dict = self._cfg_dict.to_dict()
        text = _format_dict(cfg_dict, outest_level=True)
        # copied from setup.cfg
        yapf_style = dict(
            based_on_style='pep8',
            blank_line_before_nested_class_or_def=True,
            split_before_expression_after_opening_paren=True)
        text, _ = FormatCode(text, style_config=yapf_style)  # , verify=True)

        return text

    def __repr__(self):
        return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}'

    def __len__(self):
        return len(self._cfg_dict)

    def __getattr__(self, name):
        return getattr(self._cfg_dict, name)

    def __getitem__(self, name):
        return self._cfg_dict.__getitem__(name)

    def __setattr__(self, name, value):
        if isinstance(value, dict):
            value = ConfigDict(value)
        self._cfg_dict.__setattr__(name, value)

    def __setitem__(self, name, value):
        if isinstance(value, dict):
            value = ConfigDict(value)
        self._cfg_dict.__setitem__(name, value)

    def __iter__(self):
        return iter(self._cfg_dict)

    def dump(self, file=None):

        if file is None:
            return self.pretty_text
        else:
            with open(file, 'w') as f:
                f.write(self.pretty_text)

    def merge_from_dict(self, options):
        """Merge list into cfg_dict

        Merge the dict parsed by MultipleKVAction into this cfg.

        Examples:
            >>> options = {'model.backbone.depth': 50,
            ...            'model.backbone.with_cp': True}
            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))
            >>> cfg.merge_from_dict(options)
            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
            >>> assert cfg_dict == dict(
            ...     model=dict(backbone=dict(depth=50, with_cp=True)))

        Args:
            options (dict): dict of configs to merge from.
        """
        option_cfg_dict = {}
        for full_key, v in options.items():
            d = option_cfg_dict
            key_list = full_key.split('.')
            for subkey in key_list[:-1]:
                d.setdefault(subkey, ConfigDict())
                d = d[subkey]
            subkey = key_list[-1]
            d[subkey] = v

        cfg_dict = super(SLConfig, self).__getattribute__('_cfg_dict')
        super(SLConfig, self).__setattr__(
            '_cfg_dict', SLConfig._merge_a_into_b(option_cfg_dict, cfg_dict))

    # for multiprocess
    def __setstate__(self, state):
        self.__init__(state)

    def copy(self):
        return SLConfig(self._cfg_dict.copy())

    def deepcopy(self):
        return SLConfig(self._cfg_dict.deepcopy())


class DictAction(Action):
    """
    argparse action to split an argument into KEY=VALUE form
    on the first = and append to a dictionary. List options should
    be passed as comma separated values, i.e KEY=V1,V2,V3
    """

    @staticmethod
    def _parse_int_float_bool(val):
        try:
            return int(val)
        except ValueError:
            pass
        try:
            return float(val)
        except ValueError:
            pass
        if val.lower() in ['true', 'false']:
            return True if val.lower() == 'true' else False
        if val.lower() in ['none', 'null']:
            return None
        return val

    def __call__(self, parser, namespace, values, option_string=None):
        options = {}
        for kv in values:
            key, val = kv.split('=', maxsplit=1)
            val = [self._parse_int_float_bool(v) for v in val.split(',')]
            if len(val) == 1:
                val = val[0]
            options[key] = val
        setattr(namespace, self.dest, options)
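A minimal sketch of loading a config with SLConfig and overriding entries from the command line via DictAction; the config path is one of the files added in this commit, and the override keys shown in the help text are illustrative:

# Illustrative sketch: parse a .py config, then apply key=value overrides.
import argparse
from linea.util.slconfig import SLConfig, DictAction

parser = argparse.ArgumentParser()
parser.add_argument('--config', default='linea/configs/linea/linea_hgnetv2_l.py')
parser.add_argument('--options', nargs='+', action=DictAction,
                    help='override config entries as key=value pairs; dotted keys create nested dicts')
args = parser.parse_args()

cfg = SLConfig.fromfile(args.config)   # resolves any _base_ includes recursively
if args.options is not None:
    cfg.merge_from_dict(args.options)
print(cfg.filename)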