diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp
index 164d900a1c71..31f416784453 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp
@@ -200,13 +200,20 @@ struct DistributeTransferRead final
           rewriter, indices, offsets, vectorLayout, readOp.getPermutationMap(),
           warpIndices, threadIndices);
 
-      Value slicedRead = rewriter.create<vector::TransferReadOp>(
+      VectorValue slicedRead = rewriter.create<vector::TransferReadOp>(
           readOp.getLoc(), innerVectorType, readOp.getSource(), slicedIndices,
           readOp.getPermutationMapAttr(), readOp.getPadding(), readOp.getMask(),
           readOp.getInBoundsAttr());
 
-      acc = rewriter.create<vector::InsertStridedSliceOp>(
-          readOp.getLoc(), slicedRead, acc, offsets, strides);
+      if (acc.getType().getRank() == 0) {
+        // TODO: This should really be a folding pattern in
+        // insert_strided_slice, but instead insert_strided_slice just doesn't
+        // support 0-d vectors...
+        acc = slicedRead;
+      } else {
+        acc = rewriter.create<vector::InsertStridedSliceOp>(
+            readOp.getLoc(), slicedRead, acc, offsets, strides);
+      }
     }
 
     replaceOpWithDistributedValues(rewriter, readOp, acc);
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
index 98455c93f3e0..f33c91fe3754 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
@@ -331,6 +331,42 @@ builtin.module attributes { transform.with_named_sequence } {
 
 // -----
 
+#layout = #iree_vector_ext.nested_layout<
+  subgroup_tile = [],
+  batch_tile = [],
+  outer_tile = [],
+  thread_tile = [],
+  element_tile = [],
+
+  subgroup_strides = [],
+  thread_strides = []
+>
+
+// CHECK-LABEL: @distribute_transfer_read_0d
+func.func @distribute_transfer_read_0d(%arg0: memref<128xf16>) -> vector<f16> {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f16
+  %root = vector.transfer_read %arg0[%c0], %cst
+          {in_bounds = []} : memref<128xf16>, vector<f16>
+  %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<f16>
+  func.return %rootl : vector<f16>
+}
+
+
+builtin.module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0]
+// CHECK-SAME: memref<128xf16>, vector<f16>
+// CHECK: iree_vector_ext.to_simd %[[RD]]
+
+// -----
+
 #layout_row_major = #iree_vector_ext.nested_layout<
   subgroup_tile = [1, 1],
   batch_tile = [2, 2],
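
Note on the rank-0 branch: with every tile of the nested layout empty, the
distributed type of a 0-d read is still vector<f16> and the tile-offset loop
runs exactly once, so the freshly read slice can simply become the
accumulator. The special case exists because, as the TODO in the patch says,
insert_strided_slice does not support 0-d vectors; the op the branch avoids
emitting would look roughly like this (hypothetical IR, shown only to
illustrate what the verifier would reject):

  %acc = vector.insert_strided_slice %slice, %init
         {offsets = [], strides = []} : vector<f16> into vector<f16>

Under that assumption, the distributed form the new test checks reduces to
the sliced read plus the SIMT-to-SIMD cast, roughly:

  %rd = vector.transfer_read %arg0[%c0], %cst {in_bounds = []}
        : memref<128xf16>, vector<f16>
  %simd = iree_vector_ext.to_simd %rd : vector<f16> -> vector<f16>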