open-mmlab
diff --git a/‎README.md
+5 b/‎README.md
+5
diff --git a/‎pcdet/datasets/waymo/waymo_dataset.py
+5-1 b/‎pcdet/datasets/waymo/waymo_dataset.py
+5-1
diff --git a/‎pcdet/models/backbones_2d/__init__.py
+3-2 b/‎pcdet/models/backbones_2d/__init__.py
+3-2
diff --git a/‎pcdet/models/backbones_2d/base_bev_backbone.py
+147 b/‎pcdet/models/backbones_2d/base_bev_backbone.py
+147
diff --git a/‎pcdet/models/backbones_2d/map_to_bev/__init__.py
+3-2 b/‎pcdet/models/backbones_2d/map_to_bev/__init__.py
+3-2
diff --git a/‎pcdet/models/backbones_2d/map_to_bev/pointpillar_scatter.py
+36 b/‎pcdet/models/backbones_2d/map_to_bev/pointpillar_scatter.py
+36
diff --git a/‎pcdet/models/backbones_3d/__init__.py
+3-1 b/‎pcdet/models/backbones_3d/__init__.py
+3-1
@@ -23,6 +23,8 @@ It is also the official code release of [`[PointRCNN]`](https://arxiv.org/abs/18
 
 
 ## Changelog
+[2023-06-xx] **NEW:** Added support for [`DSVT`](https://arxiv.org/abs/2301.06051), which achieves state-of-the-art performance on the large-scale Waymo Open Dataset with real-time inference speed (27 Hz with TensorRT).
+
 [2023-05-13] **NEW:** Added support for the multi-modal 3D object detection models on Nuscenes dataset.  
 * Support multi-modal Nuscenes detection (See the [GETTING_STARTED.md](docs/GETTING_STARTED.md) to process data).
 * Support [TransFusion-Lidar](https://arxiv.org/abs/2203.11496) head, which achieves 69.43% NDS on Nuscenes validation dataset.
@@ -192,6 +194,8 @@ Here we also provide the performance of several models trained on the full train
 | [PV-RCNN (CenterHead)](tools/cfgs/waymo_models/pv_rcnn_with_centerhead_rpn.yaml)          | 78.00/77.50 | 69.43/68.98 | 79.21/73.03 | 70.42/64.72 | 71.46/70.27 | 68.95/67.79 |
 | [PV-RCNN++](tools/cfgs/waymo_models/pv_rcnn_plusplus.yaml)                                | 79.10/78.63 | 70.34/69.91 | 80.62/74.62 | 71.86/66.30 | 73.49/72.38 | 70.70/69.62 |
 | [PV-RCNN++ (ResNet)](tools/cfgs/waymo_models/pv_rcnn_plusplus_resnet.yaml)                | 79.25/78.78 | 70.61/70.18 | 81.83/76.28 | 73.17/68.00 | 73.72/72.66 | 71.21/70.19 |
+| [DSVT-Pillar](tools/cfgs/waymo_models/dsvt_pillar.yaml)                             | 79.44/78.97 | 71.24/70.81 | 83.00/77.22 | 75.45/69.95 | 76.70/75.70 | 73.83/72.86 |
+| [DSVT-Voxel](tools/cfgs/waymo_models/dsvt_voxel.yaml)                             | 79.77/79.31 | 71.67/71.25 | 83.75/78.92 | 76.21/71.57 | 77.57/76.58 | 74.70/73.73 |
 | [PV-RCNN++ (ResNet, 2 frames)](tools/cfgs/waymo_models/pv_rcnn_plusplus_resnet_2frames.yaml) | 80.17/79.70 | 72.14/71.70 | 83.48/80.42 | 75.54/72.61 | 74.63/73.75 | 72.35/71.50 |
 | [MPPNet (4 frames)](docs/guidelines_of_approaches/mppnet.md)                              | 81.54/81.06 | 74.07/73.61 | 84.56/81.94 | 77.20/74.67 | 77.15/76.50 | 75.01/74.38 |
 | [MPPNet (16 frames)](docs/guidelines_of_approaches/mppnet.md)                             | 82.74/82.28 | 75.41/74.96 | 84.69/82.25 | 77.43/75.06 | 77.28/76.66 | 75.13/74.52 |
@@ -201,6 +205,7 @@ Here we also provide the performance of several models trained on the full train
 
 
 
+
 We cannot provide the above pretrained models due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), 
 but you can easily achieve similar performance by training with the default configs.
 
 
@@ -200,7 +200,11 @@ def get_lidar(self, sequence_name, sample_idx):
         points_all, NLZ_flag = point_features[:, 0:5], point_features[:, 5]
         if not self.dataset_cfg.get('DISABLE_NLZ_FLAG_ON_POINTS', False):
             points_all = points_all[NLZ_flag == -1]
-        points_all[:, 3] = np.tanh(points_all[:, 3])
+        if self.dataset_cfg.get('POINTS_TANH_DIM', None) is None:
+            points_all[:, 3] = np.tanh(points_all[:, 3])
+        else:
+            for dim_idx in self.dataset_cfg.POINTS_TANH_DIM:
+                points_all[:, dim_idx] = np.tanh(points_all[:, dim_idx])
         return points_all
 
     @staticmethod
 
@@ -1,6 +1,7 @@
-from .base_bev_backbone import BaseBEVBackbone, BaseBEVBackboneV1
+from .base_bev_backbone import BaseBEVBackbone, BaseBEVBackboneV1, BaseBEVResBackbone
 
 __all__ = {
     'BaseBEVBackbone': BaseBEVBackbone,
-    'BaseBEVBackboneV1': BaseBEVBackboneV1
+    'BaseBEVBackboneV1': BaseBEVBackboneV1,
+    'BaseBEVResBackbone': BaseBEVResBackbone,
 }
@@ -202,3 +202,150 @@ def forward(self, data_dict):
         data_dict['spatial_features_2d'] = x
 
         return data_dict
+
+
+class BasicBlock(nn.Module):
+    expansion: int = 1
+
+    def __init__(
+        self,
+        inplanes: int,
+        planes: int,
+        stride: int = 1,
+        padding: int = 1,
+        downsample: bool = False,
+    ) -> None:
+        super().__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=padding, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, eps=1e-3, momentum=0.01)
+        self.relu1 = nn.ReLU()
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, eps=1e-3, momentum=0.01)
+        self.relu2 = nn.ReLU()
+        self.downsample = downsample
+        if self.downsample:
+            self.downsample_layer = nn.Sequential(
+                nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, padding=0, bias=False),
+                nn.BatchNorm2d(planes, eps=1e-3, momentum=0.01)
+            )
+        self.stride = stride
+
+    def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu1(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample:
+            identity = self.downsample_layer(x)
+
+        out += identity
+        out = self.relu2(out)
+
+        return out
+
+
+class BaseBEVResBackbone(nn.Module):
+    def __init__(self, model_cfg, input_channels):
+        super().__init__()
+        self.model_cfg = model_cfg
+
+        if self.model_cfg.get('LAYER_NUMS', None) is not None:
+            assert len(self.model_cfg.LAYER_NUMS) == len(self.model_cfg.LAYER_STRIDES) == len(self.model_cfg.NUM_FILTERS)
+            layer_nums = self.model_cfg.LAYER_NUMS
+            layer_strides = self.model_cfg.LAYER_STRIDES
+            num_filters = self.model_cfg.NUM_FILTERS
+        else:
+            layer_nums = layer_strides = num_filters = []
+
+        if self.model_cfg.get('UPSAMPLE_STRIDES', None) is not None:
+            assert len(self.model_cfg.UPSAMPLE_STRIDES) == len(self.model_cfg.NUM_UPSAMPLE_FILTERS)
+            num_upsample_filters = self.model_cfg.NUM_UPSAMPLE_FILTERS
+            upsample_strides = self.model_cfg.UPSAMPLE_STRIDES
+        else:
+            upsample_strides = num_upsample_filters = []
+
+        num_levels = len(layer_nums)
+        c_in_list = [input_channels, *num_filters[:-1]]
+        self.blocks = nn.ModuleList()
+        self.deblocks = nn.ModuleList()
+        for idx in range(num_levels):
+            cur_layers = [
+                # nn.ZeroPad2d(1),
+                BasicBlock(c_in_list[idx], num_filters[idx], layer_strides[idx], 1, True)
+            ]
+            for k in range(layer_nums[idx]):
+                cur_layers.extend([
+                    BasicBlock(num_filters[idx], num_filters[idx])
+                ])
+            self.blocks.append(nn.Sequential(*cur_layers))
+            if len(upsample_strides) > 0:
+                stride = upsample_strides[idx]
+                if stride >= 1:
+                    self.deblocks.append(nn.Sequential(
+                        nn.ConvTranspose2d(
+                            num_filters[idx], num_upsample_filters[idx],
+                            upsample_strides[idx],
+                            stride=upsample_strides[idx], bias=False
+                        ),
+                        nn.BatchNorm2d(num_upsample_filters[idx], eps=1e-3, momentum=0.01),
+                        nn.ReLU()
+                    ))
+                else:
+                    stride = np.round(1 / stride).astype(np.int)
+                    self.deblocks.append(nn.Sequential(
+                        nn.Conv2d(
+                            num_filters[idx], num_upsample_filters[idx],
+                            stride,
+                            stride=stride, bias=False
+                        ),
+                        nn.BatchNorm2d(num_upsample_filters[idx], eps=1e-3, momentum=0.01),
+                        nn.ReLU()
+                    ))
+
+        c_in = sum(num_upsample_filters) if len(num_upsample_filters) > 0 else sum(num_filters)
+        if len(upsample_strides) > num_levels:
+            self.deblocks.append(nn.Sequential(
+                nn.ConvTranspose2d(c_in, c_in, upsample_strides[-1], stride=upsample_strides[-1], bias=False),
+                nn.BatchNorm2d(c_in, eps=1e-3, momentum=0.01),
+                nn.ReLU(),
+            ))
+
+        self.num_bev_features = c_in
+
+    def forward(self, data_dict):
+        """
+        Args:
+            data_dict:
+                spatial_features
+        Returns:
+        """
+        spatial_features = data_dict['spatial_features']
+        ups = []
+        ret_dict = {}
+        x = spatial_features
+        for i in range(len(self.blocks)):
+            x = self.blocks[i](x)
+
+            stride = int(spatial_features.shape[2] / x.shape[2])
+            ret_dict['spatial_features_%dx' % stride] = x
+            if len(self.deblocks) > 0:
+                ups.append(self.deblocks[i](x))
+            else:
+                ups.append(x)
+
+        if len(ups) > 1:
+            x = torch.cat(ups, dim=1)
+        elif len(ups) == 1:
+            x = ups[0]
+
+        if len(self.deblocks) > len(self.blocks):
+            x = self.deblocks[-1](x)
+
+        data_dict['spatial_features_2d'] = x
+
+        return data_dict
@@ -1,9 +1,10 @@
 from .height_compression import HeightCompression
-from .pointpillar_scatter import PointPillarScatter
+from .pointpillar_scatter import PointPillarScatter, PointPillarScatter3d
 from .conv2d_collapse import Conv2DCollapse
 
 __all__ = {
     'HeightCompression': HeightCompression,
     'PointPillarScatter': PointPillarScatter,
-    'Conv2DCollapse': Conv2DCollapse
+    'Conv2DCollapse': Conv2DCollapse,
+    'PointPillarScatter3d': PointPillarScatter3d,
 }
@@ -35,3 +35,39 @@ def forward(self, batch_dict, **kwargs):
         batch_spatial_features = batch_spatial_features.view(batch_size, self.num_bev_features * self.nz, self.ny, self.nx)
         batch_dict['spatial_features'] = batch_spatial_features
         return batch_dict
+
+
+class PointPillarScatter3d(nn.Module):
+    def __init__(self, model_cfg, grid_size, **kwargs):
+        super().__init__()
+        
+        self.model_cfg = model_cfg
+        self.nx, self.ny, self.nz = self.model_cfg.INPUT_SHAPE
+        self.num_bev_features = self.model_cfg.NUM_BEV_FEATURES
+        self.num_bev_features_before_compression = self.model_cfg.NUM_BEV_FEATURES // self.nz
+
+    def forward(self, batch_dict, **kwargs):
+        pillar_features, coords = batch_dict['pillar_features'], batch_dict['voxel_coords']
+        
+        batch_spatial_features = []
+        batch_size = coords[:, 0].max().int().item() + 1
+        for batch_idx in range(batch_size):
+            spatial_feature = torch.zeros(
+                self.num_bev_features_before_compression,
+                self.nz * self.nx * self.ny,
+                dtype=pillar_features.dtype,
+                device=pillar_features.device)
+
+            batch_mask = coords[:, 0] == batch_idx
+            this_coords = coords[batch_mask, :]
+            indices = this_coords[:, 1] * self.ny * self.nx + this_coords[:, 2] * self.nx + this_coords[:, 3]
+            indices = indices.type(torch.long)
+            pillars = pillar_features[batch_mask, :]
+            pillars = pillars.t()
+            spatial_feature[:, indices] = pillars
+            batch_spatial_features.append(spatial_feature)
+
+        batch_spatial_features = torch.stack(batch_spatial_features, 0)
+        batch_spatial_features = batch_spatial_features.view(batch_size, self.num_bev_features_before_compression * self.nz, self.ny, self.nx)
+        batch_dict['spatial_features'] = batch_spatial_features
+        return batch_dict
@@ -5,6 +5,7 @@
 from .spconv_backbone_voxelnext import VoxelResBackBone8xVoxelNeXt
 from .spconv_backbone_voxelnext2d import VoxelResBackBone8xVoxelNeXt2D
 from .spconv_unet import UNetV2
+from .dsvt import DSVT
 
 __all__ = {
     'VoxelBackBone8x': VoxelBackBone8x,
@@ -16,5 +17,6 @@
     'VoxelResBackBone8xVoxelNeXt': VoxelResBackBone8xVoxelNeXt,
     'VoxelResBackBone8xVoxelNeXt2D': VoxelResBackBone8xVoxelNeXt2D,
     'PillarBackBone8x': PillarBackBone8x,
-    'PillarRes18BackBone8x': PillarRes18BackBone8x
+    'PillarRes18BackBone8x': PillarRes18BackBone8x,
+    'DSVT': DSVT,
 }
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`		`-from .base_bev_backbone import BaseBEVBackbone, BaseBEVBackboneV1`
	`1`	`+from .base_bev_backbone import BaseBEVBackbone, BaseBEVBackboneV1, BaseBEVResBackbone`
`2`	`2`
`3`	`3`	`__all__ = {`
`4`	`4`	`'BaseBEVBackbone': BaseBEVBackbone,`
`5`		`- 'BaseBEVBackboneV1': BaseBEVBackboneV1`
	`5`	`+ 'BaseBEVBackboneV1': BaseBEVBackboneV1,`
	`6`	`+ 'BaseBEVResBackbone': BaseBEVResBackbone,`
`6`	`7`	`}`