Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Codegen][CPU] Enable scalable transfer lowerings #18170

Merged
merged 2 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void LLVMCPULowerExecutableTargetPass::runOnOperation() {
pipelineOpts.enableVectorMasking =
isX86(target) || isRISCV(target) ||
(isAArch64(target) && hasAnySVEFeature(target));
pipelineOpts.enableAArch64SSVE =
pipelineOpts.enableAArch64SME =
isAArch64(target) && hasAnySVEFeature(target) && hasSMEFeature(target);
pipelineOpts.enableAArch64I8mm = isAArch64(target) && hasI8mmFeature(target);
pipelineOpts.enablePeeling = isLoopPeelingEnabled(funcOp);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ void LLVMCPUVectorTransferLoweringPass::runOnOperation() {
/*maxTransferRank=*/1);
auto vectorTransferToSCFOptions =
VectorTransferToSCFOptions().enableFullUnroll();
if (enableScalableLowerings) {
vectorTransferToSCFOptions.enableLowerScalable();
}

populateVectorToSCFConversionPatterns(patterns, vectorTransferToSCFOptions);
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}
Expand Down
18 changes: 16 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,15 @@ void buildLLVMCPUVectorLoweringPipeline(
// lower them and can't be optimized away anymore.
funcPassManager.addPass(createCanonicalizerPass());

funcPassManager.addPass(createLLVMCPUVectorTransferLoweringPass());
LLVMCPUVectorTransferLoweringPassOptions transferLoweringOptions{};
if (!options.enableArmSME) {
// The ArmSME dialect has its own (more specific) lowerings for scalable
// vectors that occur later in the pipeline, so only enable the general
// lowerings if SME is not available.
transferLoweringOptions.enableScalableLowerings = true;
}
funcPassManager.addPass(
createLLVMCPUVectorTransferLoweringPass(transferLoweringOptions));
funcPassManager.addPass(createLLVMCPUVectorTransposeLoweringPass(
LLVMCPUVectorTransposeLoweringPassOptions{
options.lowerVectorTransposeToAVX2}));
Expand Down Expand Up @@ -354,6 +362,7 @@ void addCPUBufferOpsTileAndVectorizePipeline(
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
options.enableArmI8mm = pipelineOpt.enableAArch64I8mm;
options.enableArmSME = pipelineOpt.enableAArch64SME;
buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
Expand Down Expand Up @@ -396,7 +405,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createLLVMCPUPeelPass());
}

if (pipelineOpt.enableAArch64SSVE) {
if (pipelineOpt.enableAArch64SME) {
funcPassManager.addPass(createLLVMCPU2DScalableTo1DScalablePass());
}

Expand Down Expand Up @@ -432,6 +441,7 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
options.enableArmI8mm = pipelineOpt.enableAArch64I8mm;
options.enableArmSME = pipelineOpt.enableAArch64SME;
buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
Expand Down Expand Up @@ -494,6 +504,7 @@ void addConvTileAndDecomposeExpertPassPipeline(
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "shuffle";
options.enableArmI8mm = pipelineOpt.enableAArch64I8mm;
options.enableArmSME = pipelineOpt.enableAArch64SME;
buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
Expand Down Expand Up @@ -542,6 +553,7 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
options.enableArmI8mm = pipelineOpt.enableAArch64I8mm;
options.enableArmSME = pipelineOpt.enableAArch64SME;
buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}

Expand Down Expand Up @@ -583,6 +595,7 @@ void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
options.enableArmI8mm = pipelineOpt.enableAArch64I8mm;
options.enableArmSME = pipelineOpt.enableAArch64SME;
buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
Expand Down Expand Up @@ -623,6 +636,7 @@ void addCPULinalgExtTileAndVectorizePipeline(
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
options.enableArmI8mm = pipelineOpt.enableAArch64I8mm;
options.enableArmSME = pipelineOpt.enableAArch64SME;
buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
Expand Down
3 changes: 2 additions & 1 deletion compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct LLVMCPUVectorLoweringPassOptions {
std::string splitVectorTransfersTo = "";
bool lowerVectorTransposeToAVX2 = false;
bool enableArmI8mm = false;
bool enableArmSME = false;
};

std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
Expand Down Expand Up @@ -72,7 +73,7 @@ struct LLVMCPUPipelineOptions {
bool useConfiguredVectorSizes = true;
bool enablePeeling = false;
bool enableVectorMasking = false;
bool enableAArch64SSVE = false;
bool enableAArch64SME = false;
bool enableAArch64I8mm = false;
bool lowerToAVX2 = false;
};
Expand Down
5 changes: 5 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ def LLVMCPUVirtualVectorLoweringPass :
// Function-level pass that lowers the remaining vector.transfer_read /
// vector.transfer_write ops into simpler memory ops plus scf control flow
// (see the summary string below for the target op set).
def LLVMCPUVectorTransferLoweringPass :
InterfacePass<"iree-llvmcpu-vector-transfer-lowering", "mlir::FunctionOpInterface"> {
let summary = "Pass to lower transfer ops to simpler ops like `vector.load`, `vector.store`, `vector.broadcast`, and a set of scf ops.";
let options = [
// Opt-in lowerings for transfers of scalable (vector-length-agnostic)
// vectors. Kept off by default: pipelines targeting ArmSME use that
// dialect's own, more specific lowerings later in the pipeline instead.
Option<"enableScalableLowerings", "enable-scalable-lowerings", "bool",
/*default=*/"false",
"Enables scalable vector specific transfer lowerings">,
];
}

def LLVMCPUVectorTransposeLoweringPass :
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,19 @@ func.func @gather_strided_memref() {
// CHECK-LABEL: func.func @gather_strided_memref
// CHECK-NOT: memref.subview {{.*}} : memref<2592000xf32, strided<[3]>
// CHECK-NOT: vector.gather %subview[%c0] [%7], %cst_0, %cst : memref<2592000xf32, strided<[3]>

// -----

// Transposes a 4x[4] (fixed x scalable) vector and writes it out; exercises
// the scalable transfer_write lowering. The CHECK lines below verify that the
// vector.transpose is fully eliminated and the write becomes row-wise
// vector.store ops of fixed-length vector<4xf32>.
func.func @scalable_transpose_store(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
%transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
return
}

/// Note: The lowering for this is implemented/tested upstream (this just checks
/// it is enabled in IREE).

// CHECK-LABEL: func.func @scalable_transpose_store
// CHECK-NOT: vector.transpose
// CHECK: vector.store {{.*}} : memref<?x?xf32>, vector<4xf32>
// CHECK-NOT: vector.transpose
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need the second CHECK-NOT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK-NOT only checks between two matches. The first checks between func.func @scalable_transpose_store and vector.store, the second checks from vector.store to the end of the function (IIRC).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I understand what is happening. My point is that the check of vector.store already shows that the lowering happens. In this case, why do we need to check whether there is a vector.transpose following it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also want to check that the transpose (which is not directly supported) is eliminated.

Loading