Skip to content

Commit

Permalink
[Codegen][GPU] Fix allocation space in iree_gpu.shuffle_tensor lowering (#18250)

Browse files Browse the repository at this point in the history

The memory space for the destination of an `iree_gpu.shuffle_tensor` op
must always be shared once lowered. Before lowering, it is valid for the
memory space to be unspecified, but until now the lowering made no
guarantee that the destination actually ended up in shared memory. This
change enforces the shared memory space during lowering and re-enables
private allocations from bufferization, fixing potential correctness
problems that could arise from vectorization failures.
  • Loading branch information
qedawkins authored Aug 17, 2024
1 parent b7efdff commit 7cf3fc6
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s

func.func @shuffle_tensor(%init: tensor<6x6xf32>, %source: tensor<2x3xf32>, %x: index) -> tensor<3x2xf32> {
func.func @shuffle_tensor(%source: tensor<2x3xf32>, %x: index) -> tensor<3x2xf32> {
%init = tensor.empty() : tensor<6x6xf32>
%0 = iree_gpu.shuffle_tensor %source[%x, 0] [2, 3] [1, 1] to %init {
^bb0(%intermediate: tensor<6x6xf32>):
%slice = tensor.extract_slice %intermediate[0, %x] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
Expand All @@ -20,10 +21,10 @@ module attributes { transform.with_named_sequence } {
}

// CHECK-LABEL: func @shuffle_tensor
// CHECK-SAME: %[[INIT:[A-Za-z0-9]+]]: tensor<6x6xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
// CHECK-SAME: %[[X:[A-Za-z0-9]+]]: index

// CHECK: %[[INIT:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<6x6xf32>
// CHECK: %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][%[[X]], 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<6x6xf32>
// CHECK: %[[WRITE_BARRIER:.+]] = iree_gpu.value_barrier %[[IN]]
// CHECK: %[[OUT:.+]] = tensor.extract_slice %[[WRITE_BARRIER]][0, %[[X]]] [3, 2] [1, 1] : tensor<6x6xf32> to tensor<3x2xf32>
Expand All @@ -32,9 +33,10 @@ module attributes { transform.with_named_sequence } {

// -----

func.func @rank_reducing_shuffle_tensor(%init: tensor<1x6x6xf32>, %source: tensor<2x3xf32>, %x: index, %y: index) -> vector<3x2xf32> {
func.func @rank_reducing_shuffle_tensor(%source: tensor<2x3xf32>, %x: index, %y: index) -> vector<3x2xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.0 : f32
%init = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x6x6xf32>
%0 = iree_gpu.shuffle_tensor %source[0, %x, %y] [1, 2, 3] [1, 1, 1] to %init {
^bb0(%intermediate: tensor<1x6x6xf32>):
%slice = tensor.extract_slice %intermediate[0, %y, %x] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
Expand All @@ -55,11 +57,11 @@ module attributes { transform.with_named_sequence } {
}

// CHECK-LABEL: func @rank_reducing_shuffle_tensor
// CHECK-SAME: %[[INIT:[A-Za-z0-9]+]]: tensor<1x6x6xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>
// CHECK-SAME: %[[X:[A-Za-z0-9]+]]: index
// CHECK-SAME: %[[Y:[A-Za-z0-9]+]]: index

// CHECK: %[[INIT:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x6x6xf32>
// CHECK: %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][0, %[[X]], %[[Y]]] [1, 2, 3] [1, 1, 1] : tensor<2x3xf32> into tensor<1x6x6xf32>
// CHECK: %[[WRITE_BARRIER:.+]] = iree_gpu.value_barrier %[[IN]]
// CHECK: %[[OUT:.+]] = tensor.extract_slice %[[WRITE_BARRIER]][0, %[[Y]], %[[X]]] [1, 3, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<3x2xf32>
Expand All @@ -69,9 +71,10 @@ module attributes { transform.with_named_sequence } {

// -----

func.func @reshape_shuffle_tensor(%init: tensor<12x12xf32>, %source: tensor<2x3xf32>) -> vector<2x1x3x2xf32> {
func.func @reshape_shuffle_tensor(%source: tensor<2x3xf32>) -> vector<2x1x3x2xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.0 : f32
%init = tensor.empty() : tensor<12x12xf32>
%0 = iree_gpu.shuffle_tensor %source[0, 0] [2, 3] [1, 1] to %init {
^bb0(%intermediate: tensor<12x12xf32>):
%expand = tensor.expand_shape %intermediate [[0, 1], [2, 3]] output_shape [4, 3, 3, 4] : tensor<12x12xf32> into tensor<4x3x3x4xf32>
Expand All @@ -92,9 +95,9 @@ module attributes { transform.with_named_sequence } {
}

// CHECK-LABEL: func @reshape_shuffle_tensor
// CHECK-SAME: %[[INIT:[A-Za-z0-9]+]]: tensor<12x12xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor<2x3xf32>

// CHECK: %[[INIT:.+]] = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<12x12xf32>
// CHECK: %[[IN:.+]] = tensor.insert_slice %[[ARG1]] into %[[INIT]][0, 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<12x12xf32>
// CHECK: %[[WRITE_BARRIER:.+]] = iree_gpu.value_barrier %[[IN]]
// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[WRITE_BARRIER]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -923,13 +923,35 @@ struct LowerShuffleTensor
PatternRewriter &rewriter) const final {
Location loc = shuffleOp.getLoc();

Value dest = shuffleOp.getDest();
Attribute sharedMemoryAddrSpace = gpu::AddressSpaceAttr::get(
rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());

// If the destination is a tensor.empty, replace it with an alloc_tensor.
if (auto emptyDest = shuffleOp.getDest().getDefiningOp<tensor::EmptyOp>()) {
auto allocTensor = rewriter.create<bufferization::AllocTensorOp>(
emptyDest->getLoc(), emptyDest->getResultTypes()[0],
emptyDest.getDynamicSizes());
allocTensor.setMemorySpaceAttr(sharedMemoryAddrSpace);
dest = allocTensor.getResult();
} else {
// Otherwise, verify that the destination is already shared memory.
auto allocTensor = dest.getDefiningOp<bufferization::AllocTensorOp>();
if (!allocTensor || !allocTensor.getMemorySpace().has_value() ||
allocTensor.getMemorySpaceAttr() != sharedMemoryAddrSpace) {
return rewriter.notifyMatchFailure(
shuffleOp, "shuffle tensor op destination does not have shared "
"memory address space.");
}
}

// Step 1. Insert the source slice into the intermediate tensor.
SmallVector<OpFoldResult, 4> sourceOffsets = shuffleOp.getMixedOffsets();
SmallVector<OpFoldResult, 4> sourceSizes = shuffleOp.getMixedSizes();
SmallVector<OpFoldResult, 4> sourceStrides = shuffleOp.getMixedStrides();
Value insertedSlice = rewriter.create<tensor::InsertSliceOp>(
loc, shuffleOp.getSource(), shuffleOp.getDest(), sourceOffsets,
sourceSizes, sourceStrides);
loc, shuffleOp.getSource(), dest, sourceOffsets, sourceSizes,
sourceStrides);

// Step 2. Synchronize the workers.
auto writeBarrier =
Expand Down
5 changes: 1 addition & 4 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,10 +385,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createCleanupBufferAllocViewPass());

// Step 7. Bufferize.
// TODO: This is a workaround for a bug in the lowering of
// `iree_gpu.shuffle_tensor` which does not properly represent the concurrent
// nature of the write to the intermediate tensor.
addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false);
addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/true);

// Step 8. Resolve remaining parallel loops.
funcPassManager.addPass(createGPUVerifyDistributionPass());
Expand Down

0 comments on commit 7cf3fc6

Please sign in to comment.