fix npu bug. #3136

Open · wants to merge 1 commit into main

19 changes: 17 additions & 2 deletions mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp
@@ -5,19 +5,34 @@ using namespace std;

void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
Tensor dist2, Tensor idx1, Tensor idx2) {
bool is_half = XYZ1.scalar_type() == at::kHalf;
at::Tensor xyz1 = at::ones_like(XYZ1);
at::Tensor xyz2 = at::ones_like(XYZ2);
at::Tensor distf1 = at::ones_like(dist1);
at::Tensor distf2 = at::ones_like(dist2);
xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
if (is_half) {
xyz1 = xyz1.to(at::kFloat);
xyz2 = xyz2.to(at::kFloat);
distf1 = dist1.to(at::kFloat);
distf2 = dist2.to(at::kFloat);
}
OpCommand cmd;
cmd.Name("ChamferDistance")
.Input(xyz1)
.Input(xyz2)
.Output(dist1)
.Output(dist2)
.Output(distf1)
.Output(distf2)
.Output(idx1)
.Output(idx2)
.Run();
if (is_half) {
distf1 = distf1.to(at::kHalf);
distf2 = distf2.to(at::kHalf);
}
dist1.copy_(distf1);
dist2.copy_(distf2);
}

void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,
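
Note: the pattern above recurs throughout this PR: when the caller passes fp16 tensors, the NPU kernel is handed fp32 staging copies, and the result is cast back and copied into the original outputs. A minimal, NPU-free sketch of that round-trip in plain libtorch, where run_fp32_kernel is a hypothetical stand-in for the OpCommand call and is assumed to support only float32:

#include <torch/torch.h>
#include <functional>

// Sketch of the fp16 -> fp32 -> fp16 staging used in this PR.
void run_with_fp32_staging(
    const at::Tensor& input, at::Tensor& output,
    const std::function<void(const at::Tensor&, at::Tensor&)>& run_fp32_kernel) {
  const bool is_half = input.scalar_type() == at::kHalf;
  // Stage through fp32 only when the caller passed fp16 tensors.
  at::Tensor input_f = is_half ? input.to(at::kFloat) : input;
  at::Tensor output_f = is_half ? output.to(at::kFloat) : output;
  run_fp32_kernel(input_f, output_f);
  if (is_half) {
    // Cast back and write the result into the caller's fp16 output.
    output.copy_(output_f.to(at::kHalf));
  }
}
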
105 changes: 85 additions & 20 deletions mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp
@@ -4,6 +4,21 @@ using namespace std;

void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
at::Tensor input_y = input;
at::Tensor output_y = output;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
output_y = output.to(at::kFloat);
}
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::ones_like(input);
if (n_class == 1) {
@@ -12,24 +12,26 @@ void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
target_y = at::add(target_y, 1.0);
} else {
target_y = at::one_hot(target, n_class);
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
}
target_y = target_y.to(at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SigmoidFocalLoss")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(weight_y)
.Output(output)
.Output(output_y)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
output_y = output_y.to(at::kHalf);
}
output.copy_(output_y);
}

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
@@ -38,34 +55,51 @@ void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha) {
at::Tensor input_y = input;
at::Tensor grad_input_y = grad_input;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
grad_input_y = grad_input.to(at::kFloat);
}
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::ones_like(input);
if (n_class == 1) {
target_y = at::reshape(target, input.sizes());
} else {
target_y = at::one_hot(target, n_class);
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
target_y = at::mul(target_y, -1.0);
target_y = at::add(target_y, 1.0);
}
target_y = target_y.to(at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SigmoidFocalLossGrad")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(grad_up)
.Input(weight_y)
.Output(grad_input)
.Output(grad_input_y)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
grad_input_y = grad_input_y.to(at::kHalf);
}
grad_input.copy_(grad_input_y);
}

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
@@ -74,26 +108,40 @@ void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,

void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
at::Tensor input_y = input;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::one_hot(target, n_class);
target_y = target_y.to(at::kInt);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
}
at::Tensor op_output = at::ones_like(input);
at::Tensor op_output = at::ones_like(input_y);
OpCommand cmd;
string reduction = "none";
cmd.Name("SoftmaxFocalLoss")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(weight_y)
.Output(op_output)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
op_output = op_output.to(at::kHalf);
}
int64_t n_batch = input.size(0);
c10::SmallVector<int64_t, 2> offsets = {0, 0};
c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};
@@ -124,27 +172,44 @@ void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input,
float gamma, float alpha) {
at::Tensor input_y = input;
at::Tensor grad_input_y = grad_input;
bool is_half = input.scalar_type() == at::kHalf;
if (is_half) {
input_y = input.to(at::kFloat);
grad_input_y = grad_input.to(at::kFloat);
}
int64_t n_class = input.size(1);
at::Tensor target_y = at::one_hot(target, n_class);
target_y = target_y.to(at::kInt);
at::Tensor grad_up = at::ones_like(input);
int64_t weight_size = weight.size(0);
at::Tensor weight_y = at::ones_like(input);
at::Tensor weight_y = at::ones_like(input_y);
if (weight_size > 0) {
weight_y = at::broadcast_to(weight, input.sizes());
if (is_half) {
weight_y = weight_y.to(at::kFloat);
}
weight_y = at::mul(weight_y, target_y);
weight_y = at::sum(weight_y, 1, true);
weight_y = at::broadcast_to(weight_y, input.sizes());
}
OpCommand cmd;
string reduction = "none";
cmd.Name("SoftmaxFocalLossGrad")
.Input(input)
.Input(input_y)
.Input(target_y)
.Input(grad_up)
.Input(weight_y)
.Output(grad_input)
.Output(grad_input_y)
.Attr("gamma", gamma)
.Attr("alpha", alpha)
.Attr("reduction", reduction)
.Run();
if (is_half) {
grad_input_y = grad_input_y.to(at::kHalf);
}
grad_input.copy_(grad_input_y);
}

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
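
Note: besides the same fp16 staging, these hunks move the weight handling so that the per-class weight vector is expanded into a per-element weight map before the kernel runs; with an empty weight the code keeps an all-ones map instead. A standalone ATen sketch of that expansion, with shapes assumed for illustration (input [N, C] logits, target [N] class indices, weight [C]):

#include <torch/torch.h>

// Expand a per-class weight vector to the per-element weight map consumed by
// the focal-loss kernels above. A sketch under the assumed shapes, not the
// NPU code path itself.
at::Tensor expand_class_weight(const at::Tensor& input,
                               const at::Tensor& target,
                               const at::Tensor& weight) {
  const int64_t n_class = input.size(1);
  // One-hot mask of each sample's target class, in the logits' dtype.
  at::Tensor one_hot = at::one_hot(target, n_class).to(input.scalar_type());
  // Broadcast the class weights over the batch, keep only the entry matching
  // the target class, then spread that value back across all classes.
  at::Tensor w = at::broadcast_to(weight, input.sizes());
  w = at::sum(w * one_hot, 1, /*keepdim=*/true);
  return at::broadcast_to(w, input.sizes()).contiguous();
}
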
16 changes: 13 additions & 3 deletions mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp
@@ -24,6 +24,12 @@ void gather_points_forward_npu(int b, int c, int n, int npoints,
void gather_points_backward_npu(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
at::Tensor grad_out_cast = grad_out;
at::Tensor grad_points_cast = grad_points;
if (grad_out.scalar_type() == at::ScalarType::Half) {
grad_out_cast = grad_out.to(at::kFloat);
grad_points_cast = grad_points.to(at::kFloat);
}
at::Tensor indices = idx;
if (idx.scalar_type() != at::ScalarType::Int) {
indices = idx.to(at::kInt);
@@ -37,11 +43,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
for (uint64_t i = 0; i < shape.size(); i++) {
pad_size.emplace_back(shape[i]);
}
at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous();
at::Tensor trans_grad_points = grad_points_cast.transpose(1, 2).contiguous();
at::Tensor grad_points_view = trans_grad_points.view(
{trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1],
trans_grad_points.sizes()[2]});
at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous();
at::Tensor trans_grad_out = grad_out_cast.transpose(1, 2).contiguous();
trans_grad_out = trans_grad_out.view(
{trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1],
trans_grad_out.sizes()[2]});
@@ -63,7 +69,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
at::Tensor grad_points_result =
grad_points_view.view(trans_grad_points.sizes());
grad_points_result = grad_points_result.transpose(1, 2);
grad_points.copy_(grad_points_result);
at::Tensor grad_points_result_cast = grad_points_result;
if (grad_out.scalar_type() == at::ScalarType::Half) {
grad_points_result_cast = grad_points_result.to(at::kHalf);
}
grad_points.copy_(grad_points_result_cast);
}

void gather_points_forward_impl(int b, int c, int n, int npoints,
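
Note: the reshapes above only re-pack the (now fp32-staged) grad_out and grad_points for the NPU scatter op; the gradient itself is an index-add. A CPU reference sketch, not the NPU path, assuming grad_out [B, C, npoints], idx [B, npoints], grad_points [B, C, N]:

#include <torch/torch.h>

// CPU reference for gather_points backward:
//   grad_points[b, c, idx[b, j]] += grad_out[b, c, j]
void gather_points_backward_reference(const at::Tensor& grad_out,
                                      const at::Tensor& idx,
                                      at::Tensor& grad_points) {
  for (int64_t b = 0; b < grad_out.size(0); ++b) {
    // index_add_ along the point dimension accumulates each gradient column
    // into the source position recorded in idx.
    grad_points[b].index_add_(/*dim=*/1, idx[b].to(at::kLong), grad_out[b]);
  }
}
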
4 changes: 2 additions & 2 deletions mmcv/ops/csrc/pytorch/npu/knn_npu.cpp
@@ -8,11 +8,11 @@ using namespace std;
void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
// transpose known from [B, N, 3] to [B, 3, N]
at::Tensor source = xyz.transpose(1, 2).contiguous();
at::Tensor source = xyz.transpose(2, 1).contiguous();
at::Tensor target = new_xyz.contiguous();

bool is_from_knn = true;
EXEC_NPU_CMD(aclnnKnn, source, target, nsample, is_from_knn, idx, dist2);
EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2);
}

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
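
Note: the substantive edit in this file is the shorter aclnnKnn argument list, presumably matching a revised operator interface; swapping transpose(1, 2) for transpose(2, 1) is cosmetic, since both exchange the same pair of dimensions, as the quick libtorch check below illustrates.

#include <torch/torch.h>

int main() {
  // transpose(1, 2) and transpose(2, 1) swap the same two dims, so the
  // [B, N, 3] -> [B, 3, N] comment above still describes the new code.
  at::Tensor xyz = at::rand({4, 128, 3});
  TORCH_CHECK(xyz.transpose(1, 2).equal(xyz.transpose(2, 1)));
  return 0;
}
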
14 changes: 10 additions & 4 deletions mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp
@@ -50,23 +50,29 @@ void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,
int64_t pooled_height_64 = pooled_height;
int64_t pooled_width_64 = pooled_width;
int64_t pooled_channel = 1;
at::Tensor argmax_trans = argmax.transpose(1, 2).transpose(2, 3);
at::Tensor grad_output_trans = grad_output.transpose(1, 2).transpose(2, 3);
at::Tensor roi_actual_num =
at::empty_like(rois, rois.options().dtype(at::kInt));
at::Tensor x = at::ones_like(grad_input);
at::Tensor x = at::ones_like(grad_input).transpose(1, 2).transpose(2, 3);
at::Tensor y = at::zeros_like(x);
OpCommand cmd;
cmd.Name("RoiPoolingGradWithArgMax")
.Input(grad_output)
.Input(grad_output_trans)
.Input(x)
.Input(rois)
.Input(roi_actual_num)
.Input(argmax)
.Output(grad_input)
.Input(argmax_trans)
.Output(y)
.Attr("pooled_h", pooled_height_64)
.Attr("pooled_w", pooled_width_64)
.Attr("spatial_scale_h", spatial_scale)
.Attr("spatial_scale_w", spatial_scale)
.Attr("pool_channel", pooled_channel)
.Run();
at::Tensor result = y.transpose(2, 3).transpose(1, 2);
at::Tensor res = result.contiguous();
grad_input.copy_(res);
}

void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
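
Note: this hunk hands RoiPoolingGradWithArgMax NHWC-ordered tensors and transposes the result back into the NCHW grad_input. A standalone sketch of that transpose round-trip (hypothetical sizes, plain libtorch):

#include <torch/torch.h>

int main() {
  // NCHW -> NHWC: swap C with H, then the displaced C with W; the reversed
  // chain undoes it. This mirrors the transpose pairs around the NPU op above.
  at::Tensor nchw = at::rand({2, 16, 7, 7});
  at::Tensor nhwc = nchw.transpose(1, 2).transpose(2, 3).contiguous();  // [2, 7, 7, 16]
  at::Tensor back = nhwc.transpose(2, 3).transpose(1, 2).contiguous();  // [2, 16, 7, 7]
  TORCH_CHECK(back.equal(nchw));
  return 0;
}
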
5 changes: 3 additions & 2 deletions mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp
@@ -8,9 +8,10 @@ void stack_ball_query_forward_npu(float max_radius, int nsample,
const Tensor new_xyz_batch_cnt,
const Tensor xyz, const Tensor xyz_batch_cnt,
Tensor idx) {
at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous();
at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous().to(at::kFloat);
at::Tensor new_xyz_fp32 = new_xyz.to(at::kFloat);
double max_radius_double = double(max_radius);
EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, xyz_batch_cnt,
EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz_fp32, xyz_batch_cnt,
new_xyz_batch_cnt, max_radius_double, nsample, idx);
}

29 changes: 19 additions & 10 deletions mmcv/ops/csrc/pytorch/npu/three_interpolate_npu.cpp
@@ -12,17 +12,21 @@ void three_interpolate_forward_npu(int b, int c, int m, int n,
TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
"three_interpolate_forward ascend only support fp32 and fp16.");

auto point_c_trans = points.transpose(1, 2);

auto point_c_trans = points.transpose(1, 2).to(at::kFloat);
auto weight_cast = weight.to(at::kFloat);
auto out_cast = out.to(at::kFloat);
OpCommand cmd;
cmd.Name("ThreeInterpolate")
.Input(point_c_trans)
.Input(idx)
.Input(weight)
.Output(out)
.Input(weight_cast)
.Output(out_cast)
.Run();

auto output = out.view({b, n, c}).transpose(1, 2);
if (originDtype == at::kHalf) {
out_cast = out_cast.to(at::kHalf);
}
auto output = out_cast.view({b, n, c}).transpose(1, 2);
auto res = output.contiguous();
out.copy_(res);
}
@@ -34,12 +38,17 @@ void three_interpolate_backward_npu(int b, int c, int n, int m,
TORCH_CHECK((originDtype == at::kFloat || originDtype == at::kHalf),
"three_interpolate_backward ascend only support fp32 and fp16.");

auto grad_x = at::unsqueeze(grad_out, 3);
auto grad_y = at::unsqueeze(grad_points, 3);

EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight, m, grad_y);
auto grad_x = at::unsqueeze(grad_out, 3).to(at::kFloat);
auto grad_y = at::unsqueeze(grad_points, 3).to(at::kFloat);
auto weight_cast = weight.to(at::kFloat);
EXEC_NPU_CMD(aclnnThreeInterpolateBackward, grad_x, idx, weight_cast, m,
grad_y);

auto output = at::squeeze(grad_y, 3);
auto grad_y_cast = grad_y;
if (originDtype == at::kHalf) {
grad_y_cast = grad_y.to(at::kHalf);
}
auto output = at::squeeze(grad_y_cast, 3);
auto res = output.contiguous();
grad_points.copy_(res);
}
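
Note: both hunks again stage fp16 tensors through fp32 around the NPU calls. For orientation, the operator itself is the usual PointNet++ three-point interpolation; a CPU sketch of the forward formula, assuming points [B, C, M], idx [B, N, 3], weight [B, N, 3] (an illustration, not the NPU implementation):

#include <torch/torch.h>

// CPU sketch of three_interpolate forward:
//   out[b, c, n] = sum_k points[b, c, idx[b, n, k]] * weight[b, n, k]
at::Tensor three_interpolate_reference(const at::Tensor& points,
                                       const at::Tensor& idx,
                                       const at::Tensor& weight) {
  const int64_t B = points.size(0), C = points.size(1), N = idx.size(1);
  // Gather the three neighbour features for every target point, then blend
  // them with the interpolation weights.
  at::Tensor flat_idx =
      idx.to(at::kLong).reshape({B, 1, N * 3}).expand({B, C, N * 3}).contiguous();
  at::Tensor gathered = at::gather(points, /*dim=*/2, flat_idx).reshape({B, C, N, 3});
  return (gathered * weight.unsqueeze(1)).sum(/*dim=*/3);  // [B, C, N]
}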