Group parallel ray casts into tiles to reduce job dispatch overhead
victorreijgwart committed Oct 28, 2024
1 parent fea5914 commit 97525ec
Showing 2 changed files with 44 additions and 37 deletions.
10 changes: 10 additions & 0 deletions library/cpp/include/wavemap/core/utils/math/int_math.h
@@ -111,6 +111,16 @@ constexpr T div_round_up(T numerator, T denominator) {
   return numerator / denominator + (numerator % denominator != 0);
 }
 
+template <int dim>
+Eigen::Matrix<int, dim, 1> div_round_up(Eigen::Matrix<int, dim, 1> vector,
+                                        int denominator) {
+  DCHECK_GE(denominator, 0);
+  for (int dim_idx = 0; dim_idx < dim; ++dim_idx) {
+    vector[dim_idx] = div_round_up(vector[dim_idx], denominator);
+  }
+  return vector;
+}
+
 template <typename T>
 constexpr T factorial(T x) {
   T result = 1;
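As a side note on how the new overload is meant to be used, here is a minimal standalone sketch (not part of the commit; it uses simplified local copies of the helpers, while the real ones live in wavemap's int_math.h and additionally DCHECK the denominator) that computes the number of tiles needed to cover an image:

```cpp
#include <iostream>

#include <Eigen/Core>

// Scalar ceiling division, mirroring int_math::div_round_up.
constexpr int div_round_up(int numerator, int denominator) {
  return numerator / denominator + (numerator % denominator != 0);
}

// Element-wise ceiling division for integer vectors, mirroring the overload
// added in this commit (minus the DCHECK on the denominator).
template <int dim>
Eigen::Matrix<int, dim, 1> div_round_up(Eigen::Matrix<int, dim, 1> vector,
                                        int denominator) {
  for (int dim_idx = 0; dim_idx < dim; ++dim_idx) {
    vector[dim_idx] = div_round_up(vector[dim_idx], denominator);
  }
  return vector;
}

int main() {
  // Example image size and tile width; the values are illustrative only.
  const Eigen::Vector2i image_dims(640, 480);
  constexpr int kPatchWidth = 64;
  // Rounds up per component: (640, 480) / 64 -> (10, 8), since
  // ceil(480 / 64) = 8, so partial tiles at the border are still counted.
  const Eigen::Vector2i num_patches = div_round_up(image_dims, kPatchWidth);
  std::cout << "tiles per axis: " << num_patches.transpose() << std::endl;
  return 0;
}
```

Rounding up per component is what lets the tiling in raycast.cc cover images whose dimensions are not multiples of the tile width.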
71 changes: 34 additions & 37 deletions library/python/src/raycast.cc
@@ -16,11 +16,11 @@ using namespace nb::literals; // NOLINT
 namespace wavemap {
 FloatingPoint raycast(const HashedWaveletOctree& map,
                       const Point3D& start_point, const Point3D& end_point,
-                      FloatingPoint threshold) {
+                      FloatingPoint log_odds_threshold) {
   const FloatingPoint mcw = map.getMinCellWidth();
   const Ray ray(start_point, end_point, mcw);
   for (const Index3D& ray_voxel_index : ray) {
-    if (map.getCellValue(ray_voxel_index) > threshold) {
+    if (log_odds_threshold < map.getCellValue(ray_voxel_index)) {
       const Point3D voxel_center =
           convert::indexToCenterPoint(ray_voxel_index, mcw);
       return (voxel_center - start_point).norm();
@@ -32,10 +32,11 @@ FloatingPoint raycast(const HashedWaveletOctree& map,
 FloatingPoint raycast_fast(
     QueryAccelerator<HashedWaveletOctree>& query_accelerator,
     const Point3D& start_point, const Point3D& end_point,
-    FloatingPoint threshold, FloatingPoint min_cell_width) {
+    FloatingPoint log_odds_threshold) {
+  const FloatingPoint min_cell_width = query_accelerator.getMinCellWidth();
   const Ray ray(start_point, end_point, min_cell_width);
   for (const Index3D& ray_voxel_index : ray) {
-    if (query_accelerator.getCellValue(ray_voxel_index) > threshold) {
+    if (log_odds_threshold < query_accelerator.getCellValue(ray_voxel_index)) {
       const Point3D voxel_center =
           convert::indexToCenterPoint(ray_voxel_index, min_cell_width);
       return (voxel_center - start_point).norm();
@@ -75,44 +76,40 @@ void add_raycast_bindings(nb::module_& m) {
   m.def(
       "get_depth",
       [](const HashedWaveletOctree& map, const Transformation3D& pose,
-         const PinholeCameraProjectorConfig& cam_cfg, FloatingPoint threshold,
-         FloatingPoint max_range) {
-        // NOTE: This way of parallelizing it is not very efficient, as it
-        //       creates a very large number of jobs (1 per pixel), but already
-        //       leads to a nice speedup. The next step to improve it would
-        //       probably be to split the image into tiles and spawn 1 job per
-        //       tile. The tile size should be such that there are enough tiles
-        //       to distribute the work evenly across all cores even if some
-        //       tiles take much shorter than others, while still being few
-        //       enough to minimize the overhead of dispatching jobs and create
-        //       local QueryAccelerator instances for each job. Maybe 10x as
-        //       many tiles as there are workers?
-        ThreadPool thread_pool;  // By default, the pool will spawn as many
-                                 // workers as the system's reported
-                                 // std::thread::hardware_concurrency().
-        Image depth_image(cam_cfg.width, cam_cfg.height);
-        const FloatingPoint min_cell_width = map.getMinCellWidth();
+         const PinholeCameraProjectorConfig& cam_cfg,
+         FloatingPoint log_odds_threshold, FloatingPoint max_range) {
         const PinholeCameraProjector projection_model(cam_cfg);
-        const Point3D& start_point = pose.getPosition();
-        for (const Index2D& index :
-             Grid<2>(Index2D::Zero(),
-                     depth_image.getDimensions() - Index2D::Ones())) {
-          FloatingPoint& depth_pixel = depth_image.at(index);
-          thread_pool.add_task([&map, &projection_model, &pose, &start_point,
-                                &depth_pixel, index, max_range, threshold,
-                                min_cell_width]() {
+        Image depth_image(projection_model.getDimensions());
+        const Point3D& W_start_point = pose.getPosition();
+
+        ThreadPool thread_pool;
+        constexpr int kPatchWidth = 64;
+        const Index2D num_patches =
+            int_math::div_round_up(depth_image.getDimensions(), kPatchWidth);
+        for (const Index2D& patch_index :
+             Grid<2>(Index2D::Zero(), num_patches - Index2D::Ones())) {
+          thread_pool.add_task([&map, &projection_model, &pose, &W_start_point,
+                                &depth_image, patch_index, kPatchWidth,
+                                max_range, log_odds_threshold]() {
             QueryAccelerator query_accelerator(map);
-            const Vector2D image_xy = projection_model.indexToImage(index);
-            const Point3D C_point =
-                projection_model.sensorToCartesian({image_xy, max_range});
-            const Point3D end_point = pose * C_point;
-            FloatingPoint depth =
-                raycast_fast(query_accelerator, start_point, end_point,
-                             threshold, min_cell_width);
-            depth_pixel = depth;
+            const Index2D patch_min = patch_index * kPatchWidth;
+            const Index2D patch_max =
+                (patch_min.array() + kPatchWidth)
+                    .min(depth_image.getDimensions().array()) -
+                1;
+            for (const Index2D& index : Grid<2>(patch_min, patch_max)) {
+              FloatingPoint& depth_pixel = depth_image.at(index);
+              const Vector2D image_xy = projection_model.indexToImage(index);
+              const Point3D C_end_point =
+                  projection_model.sensorToCartesian({image_xy, max_range});
+              const Point3D W_end_point = pose * C_end_point;
+              depth_pixel = raycast_fast(query_accelerator, W_start_point,
+                                         W_end_point, log_odds_threshold);
+            }
           });
         }
         thread_pool.wait_all();
+
         return depth_image.getData().transpose().eval();
       },
       "Extract depth from octree map at using given camera pose and "
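To make the tiling logic easier to follow, here is a small standalone sketch (plain Eigen and the standard library only; no wavemap types, threading, or ray casting, and the image size plus the kPatchWidth, patch_min, and patch_max names simply mirror the diff) that reproduces the patch arithmetic and checks that the clamped tiles cover every pixel exactly once:

```cpp
#include <cassert>
#include <iostream>
#include <vector>

#include <Eigen/Core>

// Ceiling division, analogous to wavemap's int_math::div_round_up.
int div_round_up(int numerator, int denominator) {
  return numerator / denominator + (numerator % denominator != 0);
}

int main() {
  // Deliberately not a multiple of the tile width, to exercise border tiles.
  const Eigen::Vector2i image_dims(641, 480);
  constexpr int kPatchWidth = 64;

  std::vector<int> visit_count(image_dims.x() * image_dims.y(), 0);

  // Number of tiles per axis, rounded up so the border is covered.
  const Eigen::Vector2i num_patches(div_round_up(image_dims.x(), kPatchWidth),
                                    div_round_up(image_dims.y(), kPatchWidth));
  for (int px = 0; px < num_patches.x(); ++px) {
    for (int py = 0; py < num_patches.y(); ++py) {
      // Same arithmetic as the binding: an inclusive [patch_min, patch_max]
      // pixel range, clamped so border tiles do not run past the image.
      const Eigen::Vector2i patch_min(px * kPatchWidth, py * kPatchWidth);
      const Eigen::Vector2i patch_max =
          (patch_min.array() + kPatchWidth).min(image_dims.array()) - 1;
      // In the committed code, this inner loop runs inside one thread-pool
      // task and casts one ray per pixel of the tile.
      for (int x = patch_min.x(); x <= patch_max.x(); ++x) {
        for (int y = patch_min.y(); y <= patch_max.y(); ++y) {
          ++visit_count[x + y * image_dims.x()];
        }
      }
    }
  }

  // Every pixel is covered by exactly one tile.
  for (const int count : visit_count) {
    assert(count == 1);
  }
  std::cout << "all pixels covered exactly once" << std::endl;
  return 0;
}
```

The inclusive [patch_min, patch_max] range and the .min() clamp are what keep border tiles inside the image; in the committed code, each such tile becomes one thread-pool task that reuses a single QueryAccelerator for all of its rays, which is what reduces the job dispatch overhead compared to the previous one-job-per-pixel scheme.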
