[NPU] fix the npu code for knn and three nn ops (#3269)

huangyuan64 · web-flow · commit a3d52747f306 · 2025-04-01T22:21:05.000+08:00
diff --git a/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/knn_npu.cpp
@@ -8,7 +8,7 @@ using namespace std;
 void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
   // transpose known from [B, N, 3] to [B, 3, N]
-  at::Tensor source = xyz.transpose(1, 2).contiguous();
+  at::Tensor source = xyz.transpose(2, 1).contiguous();
   at::Tensor target = new_xyz.contiguous();
 
   bool is_from_knn = true;
diff --git a/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp b/mmcv/ops/csrc/pytorch/npu/three_nn_npu.cpp
@@ -7,21 +7,12 @@ using namespace std;
 
 void three_nn_forward_npu(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx) {
-  // transpose known  [B, N, 3] -> [B, 3, N]
-  at::Tensor source = known.transpose(1, 2).contiguous();
+  at::Tensor source = known.contiguous();
   at::Tensor target = unknown.contiguous();
-  auto originDtype = source.scalar_type();
-  if (originDtype == at::kHalf) {
-    source = source.to(at::kFloat);
-    target = target.to(at::kFloat);
-  }
 
   bool is_from_knn = false;
-  uint32_t nsample = 3;
+  int nsample = 3;
   EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, nsample, dist2, idx);
-  if (originDtype == at::kHalf) {
-    dist2 = dist2.to(at::kHalf);
-  }
 }
 
 void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
diff --git a/mmcv/ops/knn.py b/mmcv/ops/knn.py
@@ -66,6 +66,17 @@ def forward(ctx,
         B, npoint, _ = center_xyz.shape
         N = xyz.shape[1]
 
+        if xyz.device.type == 'npu':
+            dist2 = center_xyz.new_zeros((B, npoint, k)).float()
+            idx = center_xyz.new_zeros((B, npoint, k)).int()
+            ext_module.knn_forward(
+                xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)
+            # Tensor.where() is out-of-place, so its result must be assigned.
+            # NOTE(review): assumes dist2 >= 1e10 marks unfilled slots -> 0.
+            idx = torch.where(dist2 >= 1e10, torch.zeros_like(idx), idx)
+            idx = idx.transpose(2, 1).contiguous()  # [B, k, npoint]
+            return idx.int()
+
         idx = center_xyz.new_zeros((B, npoint, k)).int()
         dist2 = center_xyz.new_zeros((B, npoint, k)).float()
 
diff --git a/mmcv/ops/three_nn.py b/mmcv/ops/three_nn.py
@@ -34,6 +34,21 @@ def forward(ctx: Any, target: torch.Tensor,
 
         B, N, _ = target.size()
         m = source.size(1)
+        if source.device.type == 'npu':
+            # NPU op is restricted to fp32; fp16 is upcast and cast back below
+            source = source.transpose(2, 1).contiguous()
+            dtype_ = source.dtype
+            if dtype_ == torch.float16:
+                target = target.float()
+                source = source.float()
+            dist2 = target.new_empty(B, N, 3)
+            idx = target.new_empty(B, N, 3, dtype=torch.int32)
+            ext_module.three_nn_forward(
+                target, source, dist2, idx, b=B, n=N, m=m)
+            dist2 = torch.sqrt(dist2)
+            if dtype_ == torch.float16:
+                dist2 = dist2.half()
+            return dist2, idx.int()
         dist2 = target.new_empty(B, N, 3)
         idx = target.new_empty(B, N, 3, dtype=torch.int32)