Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SLP]Enable splat ordering for loads #115173

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

alexey-bataev
Copy link
Member

Enables splat support for loads with lanes> 2 or number of operands> 2.

Allows better detect splats of loads and reduces number of shuffles in
some cases.

X86, AVX512, -O3+LTO

Metric: size..text
results results0 diff
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 154867.00 156723.00 1.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12467735.00 12468023.00 0.0%

Better vectorization quality

Created using spr 1.3.5
@llvmbot
Copy link
Collaborator

llvmbot commented Nov 6, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Enables splat support for loads with lanes> 2 or number of operands> 2.

Allows better detect splats of loads and reduces number of shuffles in
some cases.

X86, AVX512, -O3+LTO

Metric: size..text
results results0 diff
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 154867.00 156723.00 1.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12467735.00 12468023.00 0.0%

Better vectorization quality


Full diff: https://github.com/llvm/llvm-project/pull/115173.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+11-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll (+6-12)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..bf7bc570000dde 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2386,6 +2386,9 @@ class BoUpSLP {
     /// the whole vector (it is mixed with constants or loop invariant values).
     /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
     bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+      // Small number of loads - try load matching.
+      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
+        return false;
       bool OpAPO = getData(OpIdx, Lane).APO;
       bool IsInvariant = L && L->isLoopInvariant(Op);
       unsigned Cnt = 0;
@@ -2511,23 +2514,23 @@ class BoUpSLP {
         Value *OpLane0 = getValue(OpIdx, FirstLane);
         // Keep track if we have instructions with all the same opcode on one
         // side.
-        if (isa<LoadInst>(OpLane0))
-          ReorderingModes[OpIdx] = ReorderingMode::Load;
-        else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
+        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
           // Check if OpLane0 should be broadcast.
           if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
               !canBeVectorized(OpILane0, OpIdx, FirstLane))
             ReorderingModes[OpIdx] = ReorderingMode::Splat;
+          else if (isa<LoadInst>(OpILane0))
+            ReorderingModes[OpIdx] = ReorderingMode::Load;
           else
             ReorderingModes[OpIdx] = ReorderingMode::Opcode;
-        } else if (isa<Constant>(OpLane0))
+        } else if (isa<Constant>(OpLane0)) {
           ReorderingModes[OpIdx] = ReorderingMode::Constant;
-        else if (isa<Argument>(OpLane0))
+        } else if (isa<Argument>(OpLane0)) {
           // Our best hope is a Splat. It may save some cost in some cases.
           ReorderingModes[OpIdx] = ReorderingMode::Splat;
-        else
-          // NOTE: This should be unreachable.
-          ReorderingModes[OpIdx] = ReorderingMode::Failed;
+        } else {
+          llvm_unreachable("Unexpected value kind.");
+        }
       }
 
       // Check that we don't have same operands. No need to reorder if operands
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
index 928cbe36554119..1e7cc9c268cfa1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
@@ -8,19 +8,13 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) {
 ; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[IMAG_247:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 408
-; CHECK-NEXT:    [[MUL35_248:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT:    store double [[MUL35_248]], ptr [[IMAG_247]], align 8
-; CHECK-NEXT:    [[ARRAYIDX23_1_249:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 416
-; CHECK-NEXT:    [[MUL_1_250:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT:    store double [[MUL_1_250]], ptr [[ARRAYIDX23_1_249]], align 8
 ; CHECK-NEXT:    [[IMAG_1_251:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 424
-; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT:    [[MUL35_1_252:%.*]] = fmul double [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    store double [[MUL35_1_252]], ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT:    [[ARRAYIDX23_2_253:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 432
-; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARRAYIDX23_2_253]], align 8
-; CHECK-NEXT:    [[MUL_2_254:%.*]] = fmul double [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store double [[MUL_2_254]], ptr [[ARRAYIDX23_2_253]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <2 x double> [[TMP3]], i64 2)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8
 ; CHECK-NEXT:    store double [[TMP2]], ptr [[PHASES_IN]], align 8
 ; CHECK-NEXT:    ret void
 ;

@llvmbot
Copy link
Collaborator

llvmbot commented Nov 6, 2024

@llvm/pr-subscribers-vectorizers

Author: Alexey Bataev (alexey-bataev)

Changes

Enables splat support for loads with lanes> 2 or number of operands> 2.

Allows better detect splats of loads and reduces number of shuffles in
some cases.

X86, AVX512, -O3+LTO

Metric: size..text
results results0 diff
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 154867.00 156723.00 1.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12467735.00 12468023.00 0.0%

Better vectorization quality


Full diff: https://github.com/llvm/llvm-project/pull/115173.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+11-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll (+6-12)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..bf7bc570000dde 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2386,6 +2386,9 @@ class BoUpSLP {
     /// the whole vector (it is mixed with constants or loop invariant values).
     /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
     bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+      // Small number of loads - try load matching.
+      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
+        return false;
       bool OpAPO = getData(OpIdx, Lane).APO;
       bool IsInvariant = L && L->isLoopInvariant(Op);
       unsigned Cnt = 0;
@@ -2511,23 +2514,23 @@ class BoUpSLP {
         Value *OpLane0 = getValue(OpIdx, FirstLane);
         // Keep track if we have instructions with all the same opcode on one
         // side.
-        if (isa<LoadInst>(OpLane0))
-          ReorderingModes[OpIdx] = ReorderingMode::Load;
-        else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
+        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
           // Check if OpLane0 should be broadcast.
           if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
               !canBeVectorized(OpILane0, OpIdx, FirstLane))
             ReorderingModes[OpIdx] = ReorderingMode::Splat;
+          else if (isa<LoadInst>(OpILane0))
+            ReorderingModes[OpIdx] = ReorderingMode::Load;
           else
             ReorderingModes[OpIdx] = ReorderingMode::Opcode;
-        } else if (isa<Constant>(OpLane0))
+        } else if (isa<Constant>(OpLane0)) {
           ReorderingModes[OpIdx] = ReorderingMode::Constant;
-        else if (isa<Argument>(OpLane0))
+        } else if (isa<Argument>(OpLane0)) {
           // Our best hope is a Splat. It may save some cost in some cases.
           ReorderingModes[OpIdx] = ReorderingMode::Splat;
-        else
-          // NOTE: This should be unreachable.
-          ReorderingModes[OpIdx] = ReorderingMode::Failed;
+        } else {
+          llvm_unreachable("Unexpected value kind.");
+        }
       }
 
       // Check that we don't have same operands. No need to reorder if operands
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
index 928cbe36554119..1e7cc9c268cfa1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
@@ -8,19 +8,13 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) {
 ; CHECK-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[IMAG_247:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 408
-; CHECK-NEXT:    [[MUL35_248:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT:    store double [[MUL35_248]], ptr [[IMAG_247]], align 8
-; CHECK-NEXT:    [[ARRAYIDX23_1_249:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 416
-; CHECK-NEXT:    [[MUL_1_250:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT:    store double [[MUL_1_250]], ptr [[ARRAYIDX23_1_249]], align 8
 ; CHECK-NEXT:    [[IMAG_1_251:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 424
-; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT:    [[MUL35_1_252:%.*]] = fmul double [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    store double [[MUL35_1_252]], ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT:    [[ARRAYIDX23_2_253:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 432
-; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARRAYIDX23_2_253]], align 8
-; CHECK-NEXT:    [[MUL_2_254:%.*]] = fmul double [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    store double [[MUL_2_254]], ptr [[ARRAYIDX23_2_253]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <2 x double> [[TMP3]], i64 2)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8
 ; CHECK-NEXT:    store double [[TMP2]], ptr [[PHASES_IN]], align 8
 ; CHECK-NEXT:    ret void
 ;

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants