Skip to content

Commit

Permalink
[Codegen][GPU] Fix allocation space in iree_gpu.shuffle_tensor lowering (#18250)

Browse files Browse the repository at this point in the history

The memory space for the destination of an `iree_gpu.shuffle_tensor` op
must always be shared once lowered. Before lowering, it is valid for the
memory space to be unspecified, but until now the lowering made no
guarantee that the destination actually ended up in shared memory. This
change enforces the shared memory space during lowering and re-enables
private allocations from bufferization, fixing potential correctness
problems that could arise from vectorization failures.
  • Loading branch information
qedawkins authored Aug 17, 2024
1 parent b7efdff commit 7cf3fc6
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s

func.func @shuffle_tensor(%init: tensor<6x6xf32>, %source: tensor<2x3xf32>, %x: index) -> tensor<3x2xf32> {
func.func @shuffle_tensor(%source: tensor<2x3xf32>, %x: index) -> tensor<3x2xf32> {
%init = tensor.empty() : tensor<6x6xf32>
%0 = iree_gpu.shuffle_tensor %source[%x, 0] [2, 3] [1, 1] to %init {
^bb0(%intermediate: tensor<6x6xf32>):
%slice = tensor.extract_slice %intermediate[0, %x] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
Expand All @@ -20,10 +21,10 @@ module attributes { transform.with_named_sequence } {
}

// CHECK-LABEL: func @shuffle_tensor
// CHECK-SAME: %[[INIT:[A-Za-z0-9]+]]: tensor<6x6xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
// CHECK-SAME: %[[X:[A-Za-z0-9]+]]: index

// CHECK: %[[INIT:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<6x6xf32>
// CHECK: %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][%[[X]], 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<6x6xf32>
// CHECK: %[[WRITE_BARRIER:.+]] = iree_gpu.value_barrier %[[IN]]
// CHECK: %[[OUT:.+]] = tensor.extract_slice %[[WRITE_BARRIER]][0, %[[X]]] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
Expand All @@ -32,9 +33,10 @@ module attributes { transform.with_named_sequence } {

// -----

func.func @rank_reducing_shuffle_tensor(%init: tensor<1x6x6xf32>, %source: tensor<2x3xf32>, %x: index, %y: index) -> vector<3x2xf32> {
func.func @rank_reducing_shuffle_tensor(%source: tensor<2x3xf32>, %x: index, %y: index) -> vector<3x2xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.0 : f32
%init = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x6x6xf32>
%0 = iree_gpu.shuffle_tensor %source[0, %x, %y] [1, 2, 3] [1, 1, 1] to %init {
^bb0(%intermediate: tensor<1x6x6xf32>):
%slice = tensor.extract_slice %intermediate[0, %y, %x] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
Expand All @@ -55,11 +57,11 @@ module attributes { transform.with_named_sequence } {
}

// CHECK-LABEL: func @rank_reducing_shuffle_tensor
// CHECK-SAME: %[[INIT:[A-Za-z0-9]+]]: tensor<1x6x6xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
// CHECK-SAME: %[[X:[A-Za-z0-9]+]]: index
// CHECK-SAME: %[[Y:[A-Za-z0-9]+]]: index

// CHECK: %[[INIT:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x6x6xf32>
// CHECK: %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][0, %[[X]], %[[Y]]] [1, 2, 3] [1, 1, 1] : tensor<2x3xf32> into tensor<1x6x6xf32>
// CHECK: %[[WRITE_BARRIER:.+]] = iree_gpu.value_barrier %[[IN]]
// CHECK: %[[OUT:.+]] = tensor.extract_slice %[[WRITE_BARRIER]][0, %[[Y]], %[[X]]] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
Expand All @@ -69,9 +71,10 @@ module attributes { transform.with_named_sequence } {

// -----

func.func @reshape_shuffle_tensor(%init: tensor<12x12xf32>, %source: tensor<2x3xf32>) -> vector<2x1x3x2xf32> {
func.func @reshape_shuffle_tensor(%source: tensor<2x3xf32>) -> vector<2x1x3x2xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.0 : f32
%init = tensor.empty() : tensor<12x12xf32>
%0 = iree_gpu.shuffle_tensor %source[0, 0] [2, 3] [1, 1] to %init {
^bb0(%intermediate: tensor<12x12xf32>):
%expand = tensor.expand_shape %intermediate [[0, 1], [2, 3]] output_shape [4, 3, 3, 4] : tensor<12x12xf32> into tensor<4x3x3x4xf32>
Expand All @@ -92,9 +95,9 @@ module attributes { transform.with_named_sequence } {
}

// CHECK-LABEL: func @reshape_shuffle_tensor
// CHECK-SAME: %[[INIT:[A-Za-z0-9]+]]: tensor<12x12xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>

// CHECK: %[[INIT:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<12x12xf32>
// CHECK: %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][0, 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<12x12xf32>
// CHECK: %[[WRITE_BARRIER:.+]] = iree_gpu.value_barrier %[[IN]]
// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[WRITE_BARRIER]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -923,13 +923,35 @@ struct LowerShuffleTensor
PatternRewriter &rewriter) const final {
Location loc = shuffleOp.getLoc();

Value dest = shuffleOp.getDest();
Attribute sharedMemoryAddrSpace = gpu::AddressSpaceAttr::get(
rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());

// If the destination is a tensor.empty, replace it with an alloc_tensor.
if (auto emptyDest = shuffleOp.getDest().getDefiningOp<tensor::EmptyOp>()) {
auto allocTensor = rewriter.create<bufferization::AllocTensorOp>(
emptyDest->getLoc(), emptyDest->getResultTypes()[0],
emptyDest.getDynamicSizes());
allocTensor.setMemorySpaceAttr(sharedMemoryAddrSpace);
dest = allocTensor.getResult();
} else {
// Otherwise, verify that the destination is already shared memory.
auto allocTensor = dest.getDefiningOp<bufferization::AllocTensorOp>();
if (!allocTensor || !allocTensor.getMemorySpace().has_value() ||
allocTensor.getMemorySpaceAttr() != sharedMemoryAddrSpace) {
return rewriter.notifyMatchFailure(
shuffleOp, "shuffle tensor op destination does not have shared "
"memory address space.");
}
}

// Step 1. Insert the source slice into the intermediate tensor.
SmallVector<OpFoldResult, 4> sourceOffsets = shuffleOp.getMixedOffsets();
SmallVector<OpFoldResult, 4> sourceSizes = shuffleOp.getMixedSizes();
SmallVector<OpFoldResult, 4> sourceStrides = shuffleOp.getMixedStrides();
Value insertedSlice = rewriter.create<tensor::InsertSliceOp>(
loc, shuffleOp.getSource(), shuffleOp.getDest(), sourceOffsets,
sourceSizes, sourceStrides);
loc, shuffleOp.getSource(), dest, sourceOffsets, sourceSizes,
sourceStrides);

// Step 2. Synchronize the workers.
auto writeBarrier =
Expand Down
5 changes: 1 addition & 4 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,10 +385,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createCleanupBufferAllocViewPass());

// Step 7. Bufferize.
// TODO: This is a workaround for a bug in the lowering of
// `iree_gpu.shuffle_tensor` which does not properly represent the concurrent
// nature of the write to the intermediate tensor.
addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false);
addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/true);

// Step 8. Resolve remaining parallel loops.
funcPassManager.addPass(createGPUVerifyDistributionPass());
Expand Down

0 comments on commit 7cf3fc6

Please sign in to comment.