# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, ConvModule
from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention
from mmengine.model import (BaseModule, ModuleList, caffe2_xavier_init,
normal_init, xavier_init)
from torch import Tensor
from mmdet.registry import MODELS
from mmdet.utils import ConfigType, OptMultiConfig
from ..task_modules.prior_generators import MlvlPointGenerator
from .positional_encoding import SinePositionalEncoding
from .transformer import Mask2FormerTransformerEncoder
@MODELS.register_module()
class MSDeformAttnPixelDecoder(BaseModule):
"""Pixel decoder with multi-scale deformable attention.
Args:
in_channels (list[int] | tuple[int]): Number of channels in the
input feature maps.
strides (list[int] | tuple[int]): Output strides of feature from
backbone.
feat_channels (int): Number of channels for feature.
out_channels (int): Number of channels for output.
num_outs (int): Number of output scales.
norm_cfg (:obj:`ConfigDict` or dict): Config for normalization.
Defaults to dict(type='GN', num_groups=32).
act_cfg (:obj:`ConfigDict` or dict): Config for activation.
Defaults to dict(type='ReLU').
encoder (:obj:`ConfigDict` or dict): Config for transformer
encoder. Defaults to None.
positional_encoding (:obj:`ConfigDict` or dict): Config for
transformer encoder position encoding. Defaults to
dict(num_feats=128, normalize=True).
init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
dict], optional): Initialization config dict. Defaults to None.
"""
def __init__(self,
             in_channels: Union[List[int],
                                Tuple[int]] = [256, 512, 1024, 2048],
             strides: Union[List[int], Tuple[int]] = [4, 8, 16, 32],
             feat_channels: int = 256,
             out_channels: int = 256,
             num_outs: int = 3,
             norm_cfg: ConfigType = dict(type='GN', num_groups=32),
             act_cfg: ConfigType = dict(type='ReLU'),
             encoder: ConfigType = None,
             positional_encoding: ConfigType = dict(
                 num_feats=128, normalize=True),
             init_cfg: OptMultiConfig = None) -> None:
    """Build the input projections, encoder and FPN-like layers.

    See the class docstring for the meaning of each argument.

    Raises:
        AssertionError: If ``encoder`` is None, or if the number of
            encoder levels is smaller than one or larger than the number
            of input levels.
    """
    super().__init__(init_cfg=init_cfg)
    # The encoder config is required even though the signature defaults
    # to None; fail early with a readable message.
    assert encoder is not None, \
        'encoder config must be provided for MSDeformAttnPixelDecoder'
    self.strides = strides
    self.num_input_levels = len(in_channels)
    # Item access works for both ConfigDict and plain dict configs,
    # whereas attribute access only works for ConfigDict.
    self.num_encoder_levels = \
        encoder['layer_cfg']['self_attn_cfg']['num_levels']
    assert self.num_encoder_levels >= 1, \
        'num_levels in attn_cfgs must be at least one'
    # A larger value would make the ranges below produce negative
    # indices and silently wrap around ``in_channels``.
    assert self.num_encoder_levels <= self.num_input_levels, (
        f'num_levels in attn_cfgs ({self.num_encoder_levels}) must not '
        f'exceed the number of input levels ({self.num_input_levels})')

    # 1x1 projections for the levels fed to the encoder, built from the
    # last input level backwards: input_convs[i] projects
    # in_channels[num_input_levels - 1 - i].
    first_encoder_level = self.num_input_levels - self.num_encoder_levels
    input_conv_list = []
    for i in range(self.num_input_levels - 1, first_encoder_level - 1, -1):
        input_conv = ConvModule(
            in_channels[i],
            feat_channels,
            kernel_size=1,
            norm_cfg=norm_cfg,
            act_cfg=None,
            bias=True)
        input_conv_list.append(input_conv)
    self.input_convs = ModuleList(input_conv_list)

    self.encoder = Mask2FormerTransformerEncoder(**encoder)
    # NOTE: the attribute name is misspelled ("postional") but is kept
    # as-is because ``forward`` reads it under this name.
    self.postional_encoding = SinePositionalEncoding(**positional_encoding)
    # One learned embedding per encoder level; ``forward`` pairs row i
    # with input_convs[i].
    self.level_encoding = nn.Embedding(self.num_encoder_levels,
                                       feat_channels)

    # FPN-like structure for the levels that do not pass through the
    # encoder, again built from the last such level down to level 0.
    self.lateral_convs = ModuleList()
    self.output_convs = ModuleList()
    # Convolutions only carry a bias when no normalization is configured.
    self.use_bias = norm_cfg is None
    for i in range(first_encoder_level - 1, -1, -1):
        lateral_conv = ConvModule(
            in_channels[i],
            feat_channels,
            kernel_size=1,
            bias=self.use_bias,
            norm_cfg=norm_cfg,
            act_cfg=None)
        output_conv = ConvModule(
            feat_channels,
            feat_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=self.use_bias,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.lateral_convs.append(lateral_conv)
        self.output_convs.append(output_conv)

    # Final 1x1 projection from feat_channels to out_channels.
    self.mask_feature = Conv2d(
        feat_channels, out_channels, kernel_size=1, stride=1, padding=0)

    self.num_outs = num_outs
    self.point_generator = MlvlPointGenerator(strides)
def init_weights(self) -> None:
    """Initialize the weights of all sub-modules of the pixel decoder.

    The encoder input projections use Xavier-uniform init, the FPN-like
    lateral/output convolutions and the mask feature projection use
    Caffe2-style Xavier init, and the level embedding is drawn from a
    standard normal distribution. Encoder parameters with more than one
    dimension get Xavier-normal init, after which every
    ``MultiScaleDeformableAttention`` layer runs its own ``init_weights``.
    """
    # 1x1 projections of the features that are fed to the encoder.
    for input_conv in self.input_convs:
        xavier_init(
            input_conv.conv, gain=1, bias=0, distribution='uniform')

    # Lateral and output convolutions are created in pairs, so they are
    # initialized in pairs as well.
    for lateral_conv, output_conv in zip(self.lateral_convs,
                                         self.output_convs):
        caffe2_xavier_init(lateral_conv.conv, bias=0)
        caffe2_xavier_init(output_conv.conv, bias=0)

    caffe2_xavier_init(self.mask_feature, bias=0)
    normal_init(self.level_encoding, mean=0, std=1)

    # Only parameters with more than one dimension are re-initialized.
    for param in self.encoder.parameters():
        if param.dim() <= 1:
            continue
        nn.init.xavier_normal_(param)

    # Runs last so that the attention layers' own init_weights, defined
    # in MultiScaleDeformableAttention, overrides the generic init above.
    for module in self.encoder.layers.modules():
        if isinstance(module, MultiScaleDeformableAttention):
            module.init_weights()
def forward(self, feats: List[Tensor]) -> Tuple[Tensor, Tensor]:
"""
Args:
feats (list[Tensor]): Feature maps of each level. Each has
shape of (batch_size, c, h, w).
Returns:
tuple: A tuple containing the following:
- mask_feature (Tensor): shape (batch_size, c, h, w).
- multi_scale_features (list[Tensor]): Multi scale \
features, each in shape (batch_size, c, h, w).
"""
# generate padding mask for each level, for each image
batch_size = feats[0].shape[0]
encoder_input_list = []
padding_mask_list = []
level_positional_encoding_list = []
spatial_shapes = []
reference_points_list = []
for i in range(self.num_encoder_levels):
level_idx = self.num_input_levels - i - 1
feat = feats[level_idx]
feat_projected = self.input_convs[i](feat)
feat_hw = torch._shape_as_tensor(feat)[2:].to(feat.device)
# no padding
padding_mask_resized = feat.new_zeros(
(batch_size, ) + feat.shape[-2:], dtype=torch.bool)
pos_embed = self.postional_encoding(padding_mask_resized)
level_embed = self.level_encoding.weight[i]
level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed
# (h_i * w_i, 2)
reference_points = self.point_generator.single_level_grid_priors(
feat.shape[-2:], level_idx, device=feat.device)
# normalize
feat_wh = feat_hw.unsqueeze(0).flip(dims=[0, 1])
factor = feat_wh * self.strides[level_idx]
reference_points = reference_points / factor
# shape (batch_size, c, h_i, w_i) -> (batch_size, h_i * w_i, c)
feat_projected = feat_projected.flatten(2).permute(0, 2, 1)
level_pos_embed = level_pos_embed.flatten(2).permute(0, 2, 1)
padding_mask_resized = padding_mask_resized.flatten(1)
encoder_input_list.append(feat_projected)
padding
聚财猫猫
- 粉丝: 248
- 资源: 221
最新资源
- 机械设计单轴变位机设计sw21非常好的设计图纸100%好用.zip
- 高德地图与58租房程序整合操作指南
- OAI 5G基站配置文件
- (工程项目线上支持)预瞄跟踪控制算法,单点或多点驾驶员模型,横制,纯跟踪算法 carsim和MATLAB Simulink联合仿真 附建模说明书
- 电信10000管家专用测速软件 免安装
- 机械设计等离子反应器sw18可编辑非常好的设计图纸100%好用.zip
- 开心麻花影视作品分析程序操作指南及应用场景
- 机械设计点针式打标设备sw17可编辑非常好的设计图纸100%好用.zip
- 污水处理程序 工厂污水处理控制系统 西门子PLC200smart和上位机wincc(版本号V7.4)污水处理控制系统,带图纸,带分配点位,带管道图,带PLC程序,带上位机程序,上位机画面,真实工程项
- 机械设计电动切割机X_T非常好的设计图纸100%好用.zip
- go+wails 常见加密解密工具集合
- Excel数据分析师程序操作指南与应用实例
- 《初等数论第二版》思维导图
- YouTube最受欢迎的100个频道数据,视频网站频道排行数据,油管视频数据
- 使用 HTML 和 CSS 创建简易且美观的圣诞树网页效果
- labview视觉检测,一个相机,两个相机,抓边,找圆,一套代码任意切 采用halcon模板匹配
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈