TensorAccessor cleanup (#3973)

q10 · facebook-github-bot · commit ced0cc4cbbb7 · 2025-04-16T18:39:06.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1059 - The existing `tensor_accessor.h` duplicates a lot of code from `ATen/core/TensorAccessor.h`. This diff removes the duplication and simplifies the class template specializations by using SFINAE methods instead. - Add unit tests to check that the index checking works. Reviewed By: sryap Differential Revision: D72940055
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor2.h b/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor2.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <ATen/ATen.h>
+#include <c10/core/ScalarType.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+
+#include <cstddef>
+#include <cstdint>
+
+////////////////////////////////////////////////////////////////////////////////
+// Extended TensorAccessor
+//
+// This file contains TensorAccessor and PackedTensorAccessor implementations
+// that are used in FBGEMM_GPU for additional bounds checks that are not
+// available in the standard ATen implementation. When accessors are created
+// through the builder macros MAKE_TA_WITH_NAME and MAKE_PTA_WITH_NAME, bounds
+// checks can be enabled using the FBGEMM_GPU_MEMCHECK flag.
+//
+//  https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/core/TensorAccessor.h
+//  https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/core/TensorBase.h
+////////////////////////////////////////////////////////////////////////////////
+
+namespace fbgemm_gpu::utils {
+
+// Alias that makes at::DefaultPtrTraits nameable from this namespace; it is
+// the default PtrTraits argument of TensorAccessor below.
+template <typename T>
+using DefaultPtrTraits = at::DefaultPtrTraits<T>;
+
+// Alias for at::RestrictPtrTraits, declared only when the translation unit is
+// compiled with __CUDACC__ or __HIPCC__ defined.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename T>
+using RestrictPtrTraits = at::RestrictPtrTraits<T>;
+#endif
+
+// Size, in chars, of the name_ buffer stored in each TensorAccessor. The
+// terminating NUL written by copy_str counts towards this size.
+static constexpr size_t NAME_MAX_LEN = 32;
+// Size, in chars, of the context_ buffer stored in each TensorAccessor. The
+// terminating NUL written by copy_str counts towards this size.
+static constexpr size_t CONTEXT_MAX_LEN = 256;
+
+// Copies the NUL-terminated string `src` into `dst`, truncating as needed so
+// that at most `max_len` chars (terminating NUL included) are written.
+//
+//   dst     - destination buffer holding at least `max_len` chars; when it is
+//             nullptr the call does nothing.
+//   src     - source string; when it is nullptr, `dst` becomes the empty
+//             string.
+//   max_len - capacity of `dst` in chars; when it is zero nothing is written,
+//             because even the terminator would not fit.
+//
+// For every `max_len >= 1` the result in `dst` is always NUL-terminated.
+C10_HOST_DEVICE inline void
+copy_str(char* dst, const char* src, const size_t max_len) {
+  // Without a destination, or without any capacity, there is nothing to do.
+  if (dst == nullptr || max_len == 0) {
+    return;
+  }
+
+  size_t len = 0;
+  if (src != nullptr) {
+    // Reserve the last slot for the terminator, and test the bound BEFORE
+    // dereferencing src so that src[limit] and beyond are never read.
+    const size_t limit = max_len - 1;
+    while (len < limit && src[len] != '\0') {
+      dst[len] = src[len];
+      ++len;
+    }
+  }
+
+  dst[len] = '\0';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// TensorAccessor
+//
+// This is an extension of at::TensorAccessorBase that consolidates some methods
+// defined in at::TensorAccessor.
+////////////////////////////////////////////////////////////////////////////////
+
+// Accessor over strided tensor memory that stores a tensor name and a context
+// string, and validates every element access made through at(): on a
+// violation it prints a diagnostic containing both strings and then triggers
+// CUDA_KERNEL_ASSERT.
+//
+// Template parameters:
+//   T         - element type
+//   N         - number of dimensions described by sizes/strides
+//   PtrTraits - pointer traits template (defaults to DefaultPtrTraits)
+//   index_t   - type used for sizes, strides and indices (defaults to int64_t)
+//
+// NOTE(review): only the last (1-D) indexing step is routed through at(). The
+// operator[] overloads for N > 1 never compare `i` with sizes_[0], so an
+// out-of-range outer index is detected only if the inner offset also falls
+// outside the sub-accessor's bound — confirm this is the intended coverage.
+template <
+    typename T,
+    size_t N,
+    template <typename U> class PtrTraits = DefaultPtrTraits,
+    typename index_t = int64_t>
+class TensorAccessor : public at::TensorAccessorBase<T, N, PtrTraits, index_t> {
+ public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+
+  // Builds an accessor over `data_`, described by `sizes_` and `strides_`
+  // (N entries each are read). `_name_` and `_context_` are copied into
+  // fixed-size member buffers via copy_str, truncated to NAME_MAX_LEN and
+  // CONTEXT_MAX_LEN chars respectively (terminator included).
+  //
+  // When either `sizes_` or `strides_` is nullptr the bound stays at 0, so
+  // every at() call on such an accessor reports an out-of-bounds access.
+  C10_HOST_DEVICE TensorAccessor(
+      const PtrType data_,
+      const index_t* const sizes_,
+      const index_t* const strides_,
+      const char* const _name_,
+      const char* const _context_)
+      : at::TensorAccessorBase<T, N, PtrTraits, index_t>(
+            data_,
+            sizes_,
+            strides_) {
+    if (sizes_ && strides_) {
+      // 1 + sum((size - 1) * stride): one past the largest element offset
+      // (assuming positive sizes and non-negative strides — TODO confirm).
+      numel_ = 1;
+      for (size_t d = 0; d < N; d++) {
+        numel_ += (sizes_[d] - 1) * strides_[d];
+      }
+    }
+
+    copy_str(name_, _name_, NAME_MAX_LEN);
+    copy_str(context_, _context_, CONTEXT_MAX_LEN);
+  }
+
+  // N > 1: returns the (N-1)-dimensional sub-accessor for index `i` along the
+  // first dimension, carrying over the same name and context strings.
+  template <size_t M = N>
+  C10_HOST_DEVICE inline auto operator[](const index_t i)
+      -> std::
+          enable_if_t<(M > 1), TensorAccessor<T, N - 1, PtrTraits, index_t>> {
+    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
+        this->data_ + this->strides_[0] * i,
+        this->sizes_ + 1,
+        this->strides_ + 1,
+        this->name_,
+        this->context_);
+  }
+
+  // N > 1, const overload: same as above, returning a const sub-accessor.
+  template <size_t M = N>
+  C10_HOST_DEVICE inline auto operator[](const index_t i) const
+      -> std::enable_if_t<
+          (M > 1),
+          const TensorAccessor<T, N - 1, PtrTraits, index_t>> {
+    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
+        this->data_ + this->strides_[0] * i,
+        this->sizes_ + 1,
+        this->strides_ + 1,
+        this->name_,
+        this->context_);
+  }
+
+  // N == 1: returns a reference to element `i`, bounds-checked through at().
+  template <size_t M = N>
+  C10_HOST_DEVICE inline auto operator[](const index_t i)
+      -> std::enable_if_t<(M == 1), T&> {
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    return this->at(this->strides_[0] * i);
+  }
+
+  // N == 1, const overload: returns a const reference to element `i`,
+  // bounds-checked through at().
+  template <size_t M = N>
+  C10_HOST_DEVICE inline auto operator[](const index_t i) const
+      -> std::enable_if_t<(M == 1), const T&> {
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    return this->at(this->strides_[0] * i);
+  }
+
+  // Returns a reference to the element at raw offset `idx` from data_.
+  // If `idx` is negative or not below numel_, a diagnostic is printed and
+  // CUDA_KERNEL_ASSERT is invoked with the violated condition.
+  C10_HOST_DEVICE T& at(const index_t idx) const {
+    if (idx < 0) {
+      // %lld with a long long argument is well-defined on every platform,
+      // unlike %ld with int64_t, which may be long or long long.
+      printf(
+          "[%s][Tensor %s] ERROR: (idx=%lld) < 0\n",
+          this->context_,
+          this->name_,
+          static_cast<long long>(idx));
+      CUDA_KERNEL_ASSERT(idx >= 0);
+
+    } else if (idx >= numel_) {
+      printf(
+          "[%s][Tensor %s] ERROR: (idx=%lld) >= (numel=%lld)\n",
+          this->context_,
+          this->name_,
+          static_cast<long long>(idx),
+          static_cast<long long>(numel_));
+      CUDA_KERNEL_ASSERT(idx < numel_);
+    }
+
+    return this->data_[idx];
+  }
+
+ protected:
+  // Exclusive upper bound on offsets accepted by at(). Zero-initialized so the
+  // bound is never indeterminate when the constructor skips its computation.
+  size_t numel_ = 0;
+  // NUL-terminated copies of the tensor name and caller context used in
+  // diagnostics.
+  char name_[NAME_MAX_LEN];
+  char context_[CONTEXT_MAX_LEN];
+};
+
+} // namespace fbgemm_gpu::utils
diff --git a/fbgemm_gpu/test/utils/kernel_launcher_test.cu b/fbgemm_gpu/test/utils/kernel_launcher_test.cu
@@ -307,7 +307,7 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
 }
 
 // NOTE: This test currently fails in fbcode CI for HIP with the following
-// error:
+// error (but runs without issues on both NVIDIA and AMD machines):
 //
 // void fbgemm_gpu::utils::always_fail_assertion_kernel(const int,
 // c10::hip::DeviceAssertionsData *const, uint32_t): Device-side assertion `(a
diff --git a/fbgemm_gpu/test/utils/tensor_accessor2_test.cu b/fbgemm_gpu/test/utils/tensor_accessor2_test.cu
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <gtest/gtest.h>
+#include <torch/types.h> // @manual=//caffe2:torch-cpp-cpu
+
+#include "fbgemm_gpu/utils/tensor_accessor2.h"
+
+namespace fbgemm_gpu::utils {
+
+// Checks that a write through TensorAccessor lands in the underlying tensor
+// storage, and (outside HIP builds) that an out-of-bounds access terminates
+// the process with the bounds-check assertion.
+TEST(TensorAccessorTest, tensor_access) {
+  // Input tensor: two identical rows.
+  const auto tensor1 = torch::tensor(
+      {{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f},
+       {1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f}},
+      torch::kFloat32);
+
+  // Expected tensor: same values as tensor1 except element [1][4] == 42.0f.
+  const auto tensor2 = torch::tensor(
+      {{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f, 1.6f, 1.7f},
+       {1.0f, 1.1f, 1.2f, 1.3f, 42.0f, 1.5f, 1.6f, 1.7f}},
+      torch::kFloat32);
+
+  auto accessor = TensorAccessor<float, 2, DefaultPtrTraits, int64_t>(
+      static_cast<typename DefaultPtrTraits<float>::PtrType>(
+          tensor1.data_ptr<float>()),
+      tensor1.sizes().data(),
+      tensor1.strides().data(),
+      "tensor",
+      "context");
+
+  // Accessor should work as expected
+  accessor[1][4] = 42.0f;
+
+  // Compare against the expected tensor; comparing tensor1 with itself would
+  // succeed even if the write above had no effect.
+  EXPECT_TRUE(torch::equal(tensor1, tensor2))
+      << "tensor1 is not equal to tensor2";
+
+#ifndef __HIPCC__
+  // The expected pattern is the condition text passed to CUDA_KERNEL_ASSERT
+  // in TensorAccessor::at().
+  EXPECT_DEATH({ accessor[10][20] = 3.14f; }, "idx < numel_");
+#endif
+}
+
+} // namespace fbgemm_gpu::utils

Original file line number	Diff line number	Diff line change
`@@ -307,7 +307,7 @@ TEST(KernelLauncherTest, kernel_launch_checks) {`
`307`	`307`	`}`
`308`	`308`
`309`	`309`	`// NOTE: This test currently fails in fbcode CI for HIP with the following`
`310`		`-// error:`
	`310`	`+// error (but runs without issues on both NVIDIA and AMD machines):`
`311`	`311`	`//`
`312`	`312`	`// void fbgemm_gpu::utils::always_fail_assertion_kernel(const int,`
`313`	`313`	// c10::hip::DeviceAssertionsData *const, uint32_t): Device-side assertion `(a