-
Notifications
You must be signed in to change notification settings - Fork 296
Description
I trained centerpoint model on KITTI dataset, 3D backone is VoxelBackBone8x (not SpMiddleResNetFHD), using framework of this repository to export scn.onnx successfully, pth models of scn before operation fusion and after fusion are consistent. the fused scn.onnx is as below

both VoxelBackBone8x and SpMiddleResNetFHD are based on Sparseconvolution
- scn.onnx exporting enviroment
spconv-cu113 2.3.6
protobuf 4.25.3
torch 2.2.2+cu118
onnx 1.18.0
onnxruntime 1.20.0
1, in CUDA-centerpoint, I change source code to complie with KITTI dataset. delete speed related code since KITTI dataset has no speed labels, change point cloud range,etc
1.1 commmon.h is changed as below
const unsigned int MAX_DET_NUM = 1000;
//const unsigned int DET_CHANNEL = 11;
const unsigned int DET_CHANNEL = 9; // KITTI dataset ,delete vx,vy
const unsigned int MAX_POINTS_NUM = 300000;
const unsigned int NUM_TASKS = 6;
#define checkCudaErrors(op) \
{ \
auto status = ((op)); \
if (status != 0) { \
std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
<< ":" << __LINE__ << " error status: " << status << std::endl; \
abort(); \
} \
}
class Params
{
public:
//const unsigned int task_num_stride[NUM_TASKS] = { 0, 1, 3, 5, 6, 8, };
const unsigned int task_num_stride[NUM_TASKS] = { 0, 1,}; // KITTI centerpoint
static const unsigned int num_classes = 3;
//const char *class_name[num_classes] = { "car", "truck", "construction_vehicle", "bus", "trailer", "barrier", "motorcycle", "bicycle", "pedestrian", "traffic_cone"};
const char *class_name[num_classes] = { "Car", "Pedestrian", "Cyclist"};
const float out_size_factor = 4;
const float voxel_size[2] = { 0.05, 0.05, };
const float pc_range[2] = { 0, 40, };
const float score_threshold = 0.1;
//const float post_center_range[6] = { -61.2, -61.2, -10.0, 61.2, 61.2, 10.0, };
const float post_center_range[6] = { 0.0f, -40.0f, -3.0f, 70.4f, 40.0f, 1.0f }; // KITTI
const float nms_iou_threshold = 0.2;
const unsigned int nms_pre_max_size = MAX_DET_NUM;
const unsigned int nms_post_max_size= 83;
const float min_x_range = 0;
const float max_x_range = 70.4;
const float min_y_range = -40.0;
const float max_y_range = 40.0;
const float min_z_range = -3.0;
const float max_z_range = 1.0;
//the size of a pillar
const float pillar_x_size = 0.05;
const float pillar_y_size = 0.05;
const float pillar_z_size = 0.1;
const int max_points_per_voxel = 10;
const unsigned int max_voxels = 160000;
//const unsigned int feature_num = 5;
const unsigned int feature_num = 4;
Params() {};
int getGridXSize() {
return (int)std::round((max_x_range - min_x_range) / pillar_x_size);
}
int getGridYSize() {
return (int)std::round((max_y_range - min_y_range) / pillar_y_size);
}
int getGridZSize() {
return (int)std::round((max_z_range - min_z_range) / pillar_z_size);
}
};
1.2 centerpoint.h is changed as below
//typedef struct float11 { float val[11]; } float11;
typedef struct float11 { float val[9]; } float9; // use float9
1.3 centerpoint.cpp is changed as below
extern void save_scn_out(spconv::DTensor* result, const std::string& path);
template<typename T>
double getAverage(std::vector<T> const& v) {
if (v.empty()) {
return 0;
}
return std::accumulate(v.begin(), v.end(), 0.0) / v.size();
}
CenterPoint::CenterPoint(std::string modelFile, bool verbose): verbose_(verbose)
{
trt_ = TensorRT::load(modelFile);
if(trt_ == nullptr) abort();
pre_.reset(new PreProcessCuda());
post_.reset(new PostProcessCuda());
//scn_engine_ = spconv::load_engine_from_onnx("../model/centerpoint.scn.onnx");
scn_engine_ = spconv::load_engine_from_onnx("../model/centerpoint_scn.onnx");
checkCudaErrors(cudaMallocHost((void **)&h_detections_num_, sizeof(unsigned int)));
checkCudaErrors(cudaMemset(h_detections_num_, 0, sizeof(unsigned int)));
checkCudaErrors(cudaMalloc((void **)&d_detections_, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
checkCudaErrors(cudaMemset(d_detections_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
//add d_detections_reshape_
checkCudaErrors(cudaMalloc((void **)&d_detections_reshape_, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
checkCudaErrors(cudaMemset(d_detections_reshape_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
//detections_.resize(MAX_DET_NUM, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
detections_.resize(MAX_DET_NUM, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); // KITTI, use 9 data
for(unsigned int i=0; i < NUM_TASKS; i++) {
checkCudaErrors(cudaMalloc((void **)&d_reg_[i], trt_->getBindingNumel("reg_" + std::to_string(i)) * sizeof(half)));
checkCudaErrors(cudaMalloc((void **)&d_height_[i], trt_->getBindingNumel("height_" + std::to_string(i)) * sizeof(half)));
checkCudaErrors(cudaMalloc((void **)&d_dim_[i], trt_->getBindingNumel("dim_" + std::to_string(i)) * sizeof(half)));
checkCudaErrors(cudaMalloc((void **)&d_rot_[i], trt_->getBindingNumel("rot_" + std::to_string(i)) * sizeof(half)));
//checkCudaErrors(cudaMalloc((void **)&d_vel_[i], trt_->getBindingNumel("vel_" + std::to_string(i)) * sizeof(half)));
checkCudaErrors(cudaMalloc((void **)&d_hm_[i], trt_->getBindingNumel("hm_" + std::to_string(i)) * sizeof(half)));
if(i==0){
auto d = trt_->getBindingDims("reg_" + std::to_string(i));
reg_n_ = d[0];
reg_c_ = d[1];
reg_h_ = d[2];
reg_w_ = d[3];
d = trt_->getBindingDims("height_" + std::to_string(i));
height_c_ = d[1];
d = trt_->getBindingDims("dim_" + std::to_string(i));
dim_c_ = d[1];
d = trt_->getBindingDims("rot_" + std::to_string(i));
rot_c_ = d[1];
//d = trt_->getBindingDims("vel_" + std::to_string(i));
//vel_c_ = d[1];
}
auto d = trt_->getBindingDims("hm_" + std::to_string(i));
hm_c_[i] = d[1];
}
h_mask_size_ = params_.nms_pre_max_size * DIVUP(params_.nms_pre_max_size, NMS_THREADS_PER_BLOCK) * sizeof(uint64_t);
checkCudaErrors(cudaMallocHost((void **)&h_mask_, h_mask_size_));
checkCudaErrors(cudaMemset(h_mask_, 0, h_mask_size_));
return;
}
CenterPoint::~CenterPoint(void)
{
pre_.reset();
trt_.reset();
post_.reset();
scn_engine_.reset();
checkCudaErrors(cudaFreeHost(h_detections_num_));
checkCudaErrors(cudaFree(d_detections_));
checkCudaErrors(cudaFree(d_detections_reshape_));
for (unsigned int i=0; i < NUM_TASKS; i++) {
checkCudaErrors(cudaFree(d_reg_[i]));
checkCudaErrors(cudaFree(d_height_[i]));
checkCudaErrors(cudaFree(d_dim_[i]));
checkCudaErrors(cudaFree(d_rot_[i]));
//checkCudaErrors(cudaFree(d_vel_[i]));
checkCudaErrors(cudaFree(d_hm_[i]));
}
checkCudaErrors(cudaFreeHost(h_mask_));
return;
}
int CenterPoint::prepare(){
pre_->alloc_resource();
return 0;
}
int CenterPoint::doinfer(void* points, unsigned int point_num, cudaStream_t stream)
{
float elapsedTime = 0.0f;
timer_.start(stream);
pre_->generateVoxels((float *)points, point_num, stream);
timing_pre_.push_back(timer_.stop("Voxelization", verbose_));
unsigned int valid_num = pre_->getOutput(&d_voxel_features, &d_voxel_indices, sparse_shape);
if (verbose_) {
std::cout << "valid_num: " << valid_num <<std::endl;
}
timer_.start(stream);
auto result = scn_engine_->forward(
{valid_num, 4}, spconv::DType::Float16, d_voxel_features,
{valid_num, 4}, spconv::DType::Int32, d_voxel_indices,
1, sparse_shape, stream
);
save_scn_out(result, "../scn_out_cpp.txt");
timing_scn_engine_.push_back(timer_.stop("3D Backbone", verbose_));
timer_.start(stream);
//trt_->forward({result->features_data(), d_reg_[0], d_height_[0], d_dim_[0], d_rot_[0], d_vel_[0], d_hm_[0],
// d_reg_[1], d_height_[1], d_dim_[1], d_rot_[1], d_vel_[1], d_hm_[1],
// d_reg_[2], d_height_[2], d_dim_[2], d_rot_[2], d_vel_[2], d_hm_[2],
// d_reg_[3], d_height_[3], d_dim_[3], d_rot_[3], d_vel_[3], d_hm_[3],
// d_reg_[4], d_height_[4], d_dim_[4], d_rot_[4], d_vel_[4], d_hm_[4],
// d_reg_[5], d_height_[5], d_dim_[5], d_rot_[5], d_vel_[5], d_hm_[5]}, stream);
trt_->forward({result->features_data(), d_reg_[0], d_height_[0], d_dim_[0], d_rot_[0], d_vel_[0], d_hm_[0],
d_reg_[1], d_height_[1], d_dim_[1], d_rot_[1], d_vel_[1], d_hm_[1]}, stream);
timing_trt_.push_back(timer_.stop("RPN + Head", verbose_));
nms_pred_.clear();
timer_.start(stream);
for(unsigned int i_task =0; i_task < NUM_TASKS; i_task++) {
checkCudaErrors(cudaMemset(h_detections_num_, 0, sizeof(unsigned int)));
checkCudaErrors(cudaMemset(d_detections_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
checkCudaErrors(cudaMemset(d_detections_reshape_, 0, MAX_DET_NUM * DET_CHANNEL * sizeof(float)));
d_vel_[i_task] = nullptr;
vel_c_ = 0;
post_->doPostDecodeCuda(reg_n_, reg_h_, reg_w_, reg_c_, height_c_, dim_c_, rot_c_, vel_c_, hm_c_[i_task],
d_reg_[i_task],
d_height_[i_task],
d_dim_[i_task],
d_rot_[i_task],
d_vel_[i_task],
d_hm_[i_task],
h_detections_num_,
d_detections_, stream);
if(*h_detections_num_ == 0) continue;
checkCudaErrors(cudaMemcpyAsync(detections_.data(), d_detections_, MAX_DET_NUM * DET_CHANNEL * sizeof(float), cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
std::sort(detections_.begin(), detections_.end(),
[](float9 boxes1, float9 boxes2) { return boxes1.val[10] > boxes2.val[10]; });
checkCudaErrors(cudaMemcpyAsync(d_detections_, detections_.data() , MAX_DET_NUM * DET_CHANNEL * sizeof(float), cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemsetAsync(h_mask_, 0, h_mask_size_, stream));
post_->doPermuteCuda(*h_detections_num_, d_detections_, d_detections_reshape_, stream);
checkCudaErrors(cudaStreamSynchronize(stream));
post_->doPostNMSCuda(*h_detections_num_, d_detections_reshape_, h_mask_, stream);
checkCudaErrors(cudaStreamSynchronize(stream));
int col_blocks = DIVUP(*h_detections_num_, NMS_THREADS_PER_BLOCK);
std::vector<uint64_t> remv(col_blocks, 0);
std::vector<bool> keep(*h_detections_num_, false);
int max_keep_size = 0;
for (unsigned int i_nms = 0; i_nms < *h_detections_num_; i_nms++) {
unsigned int nblock = i_nms / NMS_THREADS_PER_BLOCK;
unsigned int inblock = i_nms % NMS_THREADS_PER_BLOCK;
if (!(remv[nblock] & (1ULL << inblock))) {
keep[i_nms] = true;
if (max_keep_size++ < params_.nms_post_max_size) {
//nms_pred_.push_back(Bndbox(detections_[i_nms].val[0], detections_[i_nms].val[1], detections_[i_nms].val[2],
// detections_[i_nms].val[3], detections_[i_nms].val[4], detections_[i_nms].val[5],
// detections_[i_nms].val[6], detections_[i_nms].val[7], detections_[i_nms].val[8],
// params_.task_num_stride[i_task] + static_cast<int>(detections_[i_nms].val[9]), detections_[i_nms].val[10]));
nms_pred_.push_back(Bndbox(detections_[i_nms].val[0], detections_[i_nms].val[1], detections_[i_nms].val[2],
detections_[i_nms].val[3], detections_[i_nms].val[4], detections_[i_nms].val[5],
detections_[i_nms].val[6],
params_.task_num_stride[i_task] + static_cast<int>(detections_[i_nms].val[7]), detections_[i_nms].val[8]));
}
uint64_t* p = h_mask_ + i_nms * col_blocks;
for (int j_nms = nblock; j_nms < col_blocks; j_nms++) {
remv[j_nms] |= p[j_nms];
}
}
}
}
timing_post_.push_back(timer_.stop("Decode + NMS", verbose_));
if (verbose_) {
std::cout << "Detection NUM: " << nms_pred_.size() << std::endl;
// for(int loop = 0; loop<nms_pred_.size();loop++){
// printf("%d, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", loop, nms_pred_[loop].x, nms_pred_[loop].y,nms_pred_[loop].z,nms_pred_[loop].w,nms_pred_[loop].l,nms_pred_[loop].h,nms_pred_[loop].vx,nms_pred_[loop].vy,nms_pred_[loop].rt,nms_pred_[loop].score);
// }
}
return 0;
}
1.4 CMAKE
cmake_minimum_required(VERSION 2.8.7)
project(centerpoint)
set(arch ${CMAKE_HOST_SYSTEM_PROCESSOR})
if(${arch} STREQUAL "aarch64")
# set(TENSORRT_ROOT /root/.kiwi/lib/TensorRT-8.5.3.1-cuda11x)
# set(CUDA_TOOLKIT_ROOT_DIR /root/.kiwi/lib/cuda-11.8)
elseif(${arch} STREQUAL "x86_64")
set(TENSORRT_ROOT /home/hitbuyi/tensor_rt/TensorRT-8.6.1.6)
set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-11.3)
endif()
#set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS_RELEASE "-std=c++11 -Wextra -Wall -Wno-deprecated-declarations -O3")
...
make and build the program successfully. I write
save_scn_out(result, "../scn_out_cpp.txt");
in
CenterPoint::doinfer(void* points, unsigned int point_num, cudaStream_t stream){
...
auto result = scn_engine_->forward(
{valid_num, 4}, spconv::DType::Float16, d_voxel_features,
{valid_num, 4}, spconv::DType::Int32, d_voxel_indices,
1, sparse_shape, stream
);
save_scn_out(result, "../scn_out_cpp.txt");
...
}
to save scn.onnx's output and found data of output scn.onnx model is zero(data size: 1256200*176)
I run the offical program in my envioroment (ubuntu 20.04+ CUDA11.3+ TensorRT 8.6.1.6) successfully. The differences between my model and offical example are:
Item | Official Model | My Model |
---|---|---|
Training Dataset | nuScenes | KITTI |
Speed Input (vx, vy) | Yes | No |
3D Backbone | SpMiddleResNetFHD | VoxelBackBone8x |
pth model's parameters ared loaded into VoxelBackBone8x correctly, I write special python functioni to verified the loading process. I have verified following parts
1, voxelization in pth model and CUDA-centerpoint C++ preprocess module, their results are same
2, scn model before fusion and after fusion, given a same input , the output are same
since onnx does not support sparseconvolution operation, I have no way to verifed scn.onnx model in onnx runtime enviroment
scn.onnx inference here has strong relationd with libspconv. what is the problem about ny scn.onnx inferences ? why output of tensor with shape of 1256200*176 are all zeros?