Skip to content

Export scn.onnx from centerpoint which is trained on KITTI dataset, the output of opeartor scatterDense in tensorRT inference is all zeros #336

@hitbuyi

Description

@hitbuyi

I trained centerpoint model on KITTI dataset, 3D backone is VoxelBackBone8x (not SpMiddleResNetFHD), using framework of this repository to export scn.onnx successfully, pth models of scn before operation fusion and after fusion are consistent. the fused scn.onnx is as below

Image

both VoxelBackBone8x and SpMiddleResNetFHD are based on Sparseconvolution

  1. scn.onnx exporting enviroment
     spconv-cu113              2.3.6
     protobuf                      4.25.3
     torch                     2.2.2+cu118
     onnx                      1.18.0
     onnxruntime               1.20.0

1, in CUDA-centerpoint, I change source code to complie with KITTI dataset. delete speed related code since KITTI dataset has no speed labels, change point cloud range,etc
1.1 commmon.h is changed as below

 const unsigned int MAX_DET_NUM = 1000;  
//const unsigned int DET_CHANNEL = 11;
const unsigned int DET_CHANNEL = 9; // KITTI dataset ,delete vx,vy
const unsigned int MAX_POINTS_NUM = 300000;
const unsigned int NUM_TASKS = 6;

#define checkCudaErrors(op)                                                                  \
  {                                                                                          \
    auto status = ((op));                                                                    \
    if (status != 0) {                                                                       \
      std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
                << ":" << __LINE__ << " error status: " << status << std::endl;              \
      abort();                                                                               \
    }                                                                                        \
  }


class Params
{
  public:
    //const unsigned int task_num_stride[NUM_TASKS] = { 0, 1, 3, 5, 6, 8, };
    const unsigned int task_num_stride[NUM_TASKS] = { 0, 1,}; // KITTI centerpoint 
    static const unsigned int num_classes = 3;
    //const char *class_name[num_classes] = { "car", "truck", "construction_vehicle", "bus", "trailer", "barrier", "motorcycle", "bicycle", "pedestrian", "traffic_cone"};
    const char *class_name[num_classes] = { "Car", "Pedestrian", "Cyclist"};

    const float out_size_factor = 4;  
    const float voxel_size[2] = { 0.05, 0.05, };
    const float pc_range[2] = { 0, 40, };
    const float score_threshold = 0.1;
    //const float post_center_range[6] = { -61.2, -61.2, -10.0, 61.2, 61.2, 10.0, };
    const float post_center_range[6] = { 0.0f, -40.0f, -3.0f, 70.4f, 40.0f, 1.0f }; // KITTI 
    const float nms_iou_threshold = 0.2;
    const unsigned int nms_pre_max_size = MAX_DET_NUM;
    const unsigned int nms_post_max_size= 83;

    const float min_x_range = 0;
    const float max_x_range = 70.4;
    const float min_y_range = -40.0;
    const float max_y_range = 40.0;
    const float min_z_range = -3.0;
    const float max_z_range = 1.0;
    //the size of a pillar
    const float pillar_x_size = 0.05;
    const float pillar_y_size = 0.05;
    const float pillar_z_size = 0.1;
    const int max_points_per_voxel = 10;

    const unsigned int max_voxels = 160000;
    //const unsigned int feature_num = 5;
    const unsigned int feature_num = 4;

    Params() {};

    int getGridXSize() {
      return (int)std::round((max_x_range - min_x_range) / pillar_x_size);
    }
    int getGridYSize() {
      return (int)std::round((max_y_range - min_y_range) / pillar_y_size);
    }
    int getGridZSize() {
      return (int)std::round((max_z_range - min_z_range) / pillar_z_size);
    }
};

1.2 centerpoint.h is changed as below

//typedef struct float11 { float val[11]; } float11;
typedef struct float11 { float val[9]; } float9;   // use float9

1.3 centerpoint.cpp is changed as below

extern void save_scn_out(spconv::DTensor* result, const std::string& path);

template<typename T>
double getAverage(std::vector<T> const& v) {
    if (v.empty()) {
        return 0;
    }
    return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
}

CenterPoint::CenterPoint(std::string modelFile, bool verbose): verbose_(verbose)
{
    trt_ = TensorRT::load(modelFile);
    if(trt_ == nullptr) abort();

    pre_.reset(new PreProcessCuda());
    post_.reset(new PostProcessCuda());

    //scn_engine_ = spconv::load_engine_from_onnx("../model/centerpoint.scn.onnx");
    scn_engine_ = spconv::load_engine_from_onnx("../model/centerpoint_scn.onnx");

    checkCudaErrors(cudaMallocHost((void **)&h_detections_num_, sizeof(unsigned int)));
    checkCudaErrors(cudaMemset(h_detections_num_, 0, sizeof(unsigned int)));

    checkCudaErrors(cudaMalloc((void **)&d_detections_, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
    checkCudaErrors(cudaMemset(d_detections_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));

    //add d_detections_reshape_
    checkCudaErrors(cudaMalloc((void **)&d_detections_reshape_, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
    checkCudaErrors(cudaMemset(d_detections_reshape_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));

    //detections_.resize(MAX_DET_NUM, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
    detections_.resize(MAX_DET_NUM, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); // KITTI, use 9 data

    for(unsigned int i=0; i < NUM_TASKS; i++) {
        checkCudaErrors(cudaMalloc((void **)&d_reg_[i], trt_->getBindingNumel("reg_" + std::to_string(i)) * sizeof(half)));
        checkCudaErrors(cudaMalloc((void **)&d_height_[i], trt_->getBindingNumel("height_" + std::to_string(i)) * sizeof(half)));
        checkCudaErrors(cudaMalloc((void **)&d_dim_[i], trt_->getBindingNumel("dim_" + std::to_string(i)) * sizeof(half)));
        checkCudaErrors(cudaMalloc((void **)&d_rot_[i], trt_->getBindingNumel("rot_" + std::to_string(i)) * sizeof(half)));
        //checkCudaErrors(cudaMalloc((void **)&d_vel_[i], trt_->getBindingNumel("vel_" + std::to_string(i)) * sizeof(half)));
        checkCudaErrors(cudaMalloc((void **)&d_hm_[i], trt_->getBindingNumel("hm_" + std::to_string(i)) * sizeof(half)));

        if(i==0){
            auto d = trt_->getBindingDims("reg_" + std::to_string(i));
            reg_n_ = d[0];
            reg_c_ = d[1];
            reg_h_ = d[2];
            reg_w_ = d[3];

            d = trt_->getBindingDims("height_" + std::to_string(i));
            height_c_ = d[1];
            d = trt_->getBindingDims("dim_" + std::to_string(i));
            dim_c_ = d[1];
            d = trt_->getBindingDims("rot_" + std::to_string(i));
            rot_c_ = d[1];
            //d = trt_->getBindingDims("vel_" + std::to_string(i));
            //vel_c_ = d[1];
        }
        auto d = trt_->getBindingDims("hm_" + std::to_string(i));
        hm_c_[i] = d[1];
    }
    h_mask_size_ = params_.nms_pre_max_size * DIVUP(params_.nms_pre_max_size, NMS_THREADS_PER_BLOCK) * sizeof(uint64_t);
    checkCudaErrors(cudaMallocHost((void **)&h_mask_, h_mask_size_));
    checkCudaErrors(cudaMemset(h_mask_, 0, h_mask_size_));
    return;
}

CenterPoint::~CenterPoint(void)
{
    pre_.reset();
    trt_.reset();
    post_.reset();
    scn_engine_.reset();

    checkCudaErrors(cudaFreeHost(h_detections_num_));
    checkCudaErrors(cudaFree(d_detections_));
    checkCudaErrors(cudaFree(d_detections_reshape_)); 

    for (unsigned int i=0; i < NUM_TASKS; i++) {
        checkCudaErrors(cudaFree(d_reg_[i]));
        checkCudaErrors(cudaFree(d_height_[i]));
        checkCudaErrors(cudaFree(d_dim_[i]));
        checkCudaErrors(cudaFree(d_rot_[i]));
        //checkCudaErrors(cudaFree(d_vel_[i]));
        checkCudaErrors(cudaFree(d_hm_[i]));
    }

    checkCudaErrors(cudaFreeHost(h_mask_));
    return;
}

int CenterPoint::prepare(){
    pre_->alloc_resource();
    return 0;
}

int CenterPoint::doinfer(void* points, unsigned int point_num, cudaStream_t stream)
{
    float elapsedTime = 0.0f;

    timer_.start(stream);
    pre_->generateVoxels((float *)points, point_num, stream);
    timing_pre_.push_back(timer_.stop("Voxelization", verbose_));

    unsigned int valid_num = pre_->getOutput(&d_voxel_features, &d_voxel_indices, sparse_shape);
    if (verbose_) {
        std::cout << "valid_num: " << valid_num <<std::endl;
    }

    timer_.start(stream);
    auto result = scn_engine_->forward(
        {valid_num, 4}, spconv::DType::Float16, d_voxel_features,
        {valid_num, 4}, spconv::DType::Int32,   d_voxel_indices,
        1, sparse_shape, stream
    );


    save_scn_out(result, "../scn_out_cpp.txt");
    timing_scn_engine_.push_back(timer_.stop("3D Backbone", verbose_));

    timer_.start(stream);
    //trt_->forward({result->features_data(), d_reg_[0], d_height_[0], d_dim_[0], d_rot_[0], d_vel_[0], d_hm_[0],
    //                                            d_reg_[1], d_height_[1], d_dim_[1], d_rot_[1], d_vel_[1], d_hm_[1],
    //                                            d_reg_[2], d_height_[2], d_dim_[2], d_rot_[2], d_vel_[2], d_hm_[2],
    //                                            d_reg_[3], d_height_[3], d_dim_[3], d_rot_[3], d_vel_[3], d_hm_[3],
    //                                            d_reg_[4], d_height_[4], d_dim_[4], d_rot_[4], d_vel_[4], d_hm_[4],
    //                                            d_reg_[5], d_height_[5], d_dim_[5], d_rot_[5], d_vel_[5], d_hm_[5]}, stream);

    trt_->forward({result->features_data(), d_reg_[0], d_height_[0], d_dim_[0], d_rot_[0], d_vel_[0], d_hm_[0],
                                            d_reg_[1], d_height_[1], d_dim_[1], d_rot_[1], d_vel_[1], d_hm_[1]}, stream);

    timing_trt_.push_back(timer_.stop("RPN + Head", verbose_));
    nms_pred_.clear();

    timer_.start(stream);
    for(unsigned int i_task =0; i_task < NUM_TASKS; i_task++) {
        checkCudaErrors(cudaMemset(h_detections_num_, 0, sizeof(unsigned int)));
        checkCudaErrors(cudaMemset(d_detections_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
        checkCudaErrors(cudaMemset(d_detections_reshape_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));

        d_vel_[i_task] = nullptr;
        vel_c_ = 0;

        post_->doPostDecodeCuda(reg_n_, reg_h_, reg_w_, reg_c_, height_c_, dim_c_, rot_c_, vel_c_, hm_c_[i_task],
                                d_reg_[i_task],
                                d_height_[i_task],
                                d_dim_[i_task],
                                d_rot_[i_task],
                                d_vel_[i_task],
                                d_hm_[i_task],
                                h_detections_num_,
                                d_detections_, stream);
        if(*h_detections_num_ == 0) continue;

        checkCudaErrors(cudaMemcpyAsync(detections_.data(), d_detections_, MAX_DET_NUM * DET_CHANNEL * sizeof(float), cudaMemcpyDeviceToHost, stream));
        checkCudaErrors(cudaStreamSynchronize(stream));

        std::sort(detections_.begin(), detections_.end(),
                [](float9 boxes1, float9 boxes2) { return boxes1.val[10] > boxes2.val[10]; });

        checkCudaErrors(cudaMemcpyAsync(d_detections_, detections_.data() , MAX_DET_NUM * DET_CHANNEL * sizeof(float), cudaMemcpyHostToDevice, stream));
        checkCudaErrors(cudaMemsetAsync(h_mask_, 0, h_mask_size_, stream));

        post_->doPermuteCuda(*h_detections_num_, d_detections_, d_detections_reshape_, stream);
        checkCudaErrors(cudaStreamSynchronize(stream));

        post_->doPostNMSCuda(*h_detections_num_, d_detections_reshape_, h_mask_, stream);
        checkCudaErrors(cudaStreamSynchronize(stream));

        int col_blocks = DIVUP(*h_detections_num_, NMS_THREADS_PER_BLOCK);
        std::vector<uint64_t> remv(col_blocks, 0);
        std::vector<bool> keep(*h_detections_num_, false);
        int max_keep_size = 0;
        for (unsigned int i_nms = 0; i_nms < *h_detections_num_; i_nms++) {
            unsigned int nblock = i_nms / NMS_THREADS_PER_BLOCK;
            unsigned int inblock = i_nms % NMS_THREADS_PER_BLOCK;

            if (!(remv[nblock] & (1ULL << inblock))) {
                keep[i_nms] = true;
                if (max_keep_size++ < params_.nms_post_max_size) {
                    //nms_pred_.push_back(Bndbox(detections_[i_nms].val[0], detections_[i_nms].val[1], detections_[i_nms].val[2],
                    //                    detections_[i_nms].val[3], detections_[i_nms].val[4], detections_[i_nms].val[5],
                    //                    detections_[i_nms].val[6], detections_[i_nms].val[7], detections_[i_nms].val[8],
                    //                    params_.task_num_stride[i_task] + static_cast<int>(detections_[i_nms].val[9]), detections_[i_nms].val[10]));

                    nms_pred_.push_back(Bndbox(detections_[i_nms].val[0], detections_[i_nms].val[1], detections_[i_nms].val[2],
                                        detections_[i_nms].val[3], detections_[i_nms].val[4], detections_[i_nms].val[5],
                                        detections_[i_nms].val[6],
                                        params_.task_num_stride[i_task] + static_cast<int>(detections_[i_nms].val[7]), detections_[i_nms].val[8]));

                }
                uint64_t* p = h_mask_ + i_nms * col_blocks;
                for (int j_nms = nblock; j_nms < col_blocks; j_nms++) {
                    remv[j_nms] |= p[j_nms];
                }
            }
        }
    }
    timing_post_.push_back(timer_.stop("Decode + NMS", verbose_));
    if (verbose_) {
        std::cout << "Detection NUM: " << nms_pred_.size() << std::endl;
        // for(int loop = 0; loop<nms_pred_.size();loop++){
        //     printf("%d, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", loop, nms_pred_[loop].x, nms_pred_[loop].y,nms_pred_[loop].z,nms_pred_[loop].w,nms_pred_[loop].l,nms_pred_[loop].h,nms_pred_[loop].vx,nms_pred_[loop].vy,nms_pred_[loop].rt,nms_pred_[loop].score);
        // }
    }
    return 0;
}

1.4 CMAKE

cmake_minimum_required(VERSION 2.8.7)
project(centerpoint)
set(arch ${CMAKE_HOST_SYSTEM_PROCESSOR})

if(${arch}     STREQUAL  "aarch64")
  # set(TENSORRT_ROOT         /root/.kiwi/lib/TensorRT-8.5.3.1-cuda11x)
  # set(CUDA_TOOLKIT_ROOT_DIR /root/.kiwi/lib/cuda-11.8)
elseif(${arch} STREQUAL "x86_64")
  set(TENSORRT_ROOT         /home/hitbuyi/tensor_rt/TensorRT-8.6.1.6)
  set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-11.3)
endif()

#set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS_RELEASE "-std=c++11 -Wextra -Wall -Wno-deprecated-declarations -O3")

...

make and build the program successfully. I write
save_scn_out(result, "../scn_out_cpp.txt");
in

CenterPoint::doinfer(void* points, unsigned int point_num, cudaStream_t stream){
    ...
    auto result = scn_engine_->forward(
        {valid_num, 4}, spconv::DType::Float16, d_voxel_features,
        {valid_num, 4}, spconv::DType::Int32,   d_voxel_indices,
        1, sparse_shape, stream
    );
    save_scn_out(result, "../scn_out_cpp.txt");
   ...
}

to save scn.onnx's output and found data of output scn.onnx model is zero(data size: 1256200*176)

I run the offical program in my envioroment (ubuntu 20.04+ CUDA11.3+ TensorRT 8.6.1.6) successfully. The differences between my model and offical example are:

Item Official Model My Model
Training Dataset nuScenes KITTI
Speed Input (vx, vy) Yes No
3D Backbone SpMiddleResNetFHD VoxelBackBone8x

pth model's parameters ared loaded into VoxelBackBone8x correctly, I write special python functioni to verified the loading process. I have verified following parts
1, voxelization in pth model and CUDA-centerpoint C++ preprocess module, their results are same
2, scn model before fusion and after fusion, given a same input , the output are same
since onnx does not support sparseconvolution operation, I have no way to verifed scn.onnx model in onnx runtime enviroment

scn.onnx inference here has strong relationd with libspconv. what is the problem about ny scn.onnx inferences ? why output of tensor with shape of 1256200*176 are all zeros?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions