-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Enable splat ordering for loads #115173
base: main
Are you sure you want to change the base?
[SLP]Enable splat ordering for loads #115173
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesEnables splat support for loads with lanes> 2 or number of operands> 2. Allows better detect splats of loads and reduces number of shuffles in X86, AVX512, -O3+LTO Metric: size..text Better vectorization quality Full diff: https://github.com/llvm/llvm-project/pull/115173.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..bf7bc570000dde 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2386,6 +2386,9 @@ class BoUpSLP {
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ // Small number of loads - try load matching.
+ if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
+ return false;
bool OpAPO = getData(OpIdx, Lane).APO;
bool IsInvariant = L && L->isLoopInvariant(Op);
unsigned Cnt = 0;
@@ -2511,23 +2514,23 @@ class BoUpSLP {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
- if (isa<LoadInst>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Load;
- else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
+ if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
!canBeVectorized(OpILane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else if (isa<LoadInst>(OpILane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
else
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
- } else if (isa<Constant>(OpLane0))
+ } else if (isa<Constant>(OpLane0)) {
ReorderingModes[OpIdx] = ReorderingMode::Constant;
- else if (isa<Argument>(OpLane0))
+ } else if (isa<Argument>(OpLane0)) {
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- // NOTE: This should be unreachable.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ } else {
+ llvm_unreachable("Unexpected value kind.");
+ }
}
// Check that we don't have same operands. No need to reorder if operands
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
index 928cbe36554119..1e7cc9c268cfa1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
@@ -8,19 +8,13 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) {
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
; CHECK-NEXT: [[IMAG_247:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 408
-; CHECK-NEXT: [[MUL35_248:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT: store double [[MUL35_248]], ptr [[IMAG_247]], align 8
-; CHECK-NEXT: [[ARRAYIDX23_1_249:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 416
-; CHECK-NEXT: [[MUL_1_250:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT: store double [[MUL_1_250]], ptr [[ARRAYIDX23_1_249]], align 8
; CHECK-NEXT: [[IMAG_1_251:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 424
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT: [[MUL35_1_252:%.*]] = fmul double [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store double [[MUL35_1_252]], ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT: [[ARRAYIDX23_2_253:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 432
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX23_2_253]], align 8
-; CHECK-NEXT: [[MUL_2_254:%.*]] = fmul double [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store double [[MUL_2_254]], ptr [[ARRAYIDX23_2_253]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <2 x double> [[TMP3]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8
; CHECK-NEXT: store double [[TMP2]], ptr [[PHASES_IN]], align 8
; CHECK-NEXT: ret void
;
|
@llvm/pr-subscribers-vectorizers Author: Alexey Bataev (alexey-bataev) ChangesEnables splat support for loads with lanes> 2 or number of operands> 2. Allows better detect splats of loads and reduces number of shuffles in X86, AVX512, -O3+LTO Metric: size..text Better vectorization quality Full diff: https://github.com/llvm/llvm-project/pull/115173.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4454eb3e34d983..bf7bc570000dde 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2386,6 +2386,9 @@ class BoUpSLP {
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ // Small number of loads - try load matching.
+ if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
+ return false;
bool OpAPO = getData(OpIdx, Lane).APO;
bool IsInvariant = L && L->isLoopInvariant(Op);
unsigned Cnt = 0;
@@ -2511,23 +2514,23 @@ class BoUpSLP {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
- if (isa<LoadInst>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Load;
- else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
+ if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
!canBeVectorized(OpILane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else if (isa<LoadInst>(OpILane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
else
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
- } else if (isa<Constant>(OpLane0))
+ } else if (isa<Constant>(OpLane0)) {
ReorderingModes[OpIdx] = ReorderingMode::Constant;
- else if (isa<Argument>(OpLane0))
+ } else if (isa<Argument>(OpLane0)) {
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- // NOTE: This should be unreachable.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ } else {
+ llvm_unreachable("Unexpected value kind.");
+ }
}
// Check that we don't have same operands. No need to reorder if operands
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
index 928cbe36554119..1e7cc9c268cfa1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll
@@ -8,19 +8,13 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) {
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
; CHECK-NEXT: [[IMAG_247:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 408
-; CHECK-NEXT: [[MUL35_248:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT: store double [[MUL35_248]], ptr [[IMAG_247]], align 8
-; CHECK-NEXT: [[ARRAYIDX23_1_249:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 416
-; CHECK-NEXT: [[MUL_1_250:%.*]] = fmul double [[TMP2]], 0.000000e+00
-; CHECK-NEXT: store double [[MUL_1_250]], ptr [[ARRAYIDX23_1_249]], align 8
; CHECK-NEXT: [[IMAG_1_251:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 424
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT: [[MUL35_1_252:%.*]] = fmul double [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store double [[MUL35_1_252]], ptr [[IMAG_1_251]], align 8
-; CHECK-NEXT: [[ARRAYIDX23_2_253:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 432
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX23_2_253]], align 8
-; CHECK-NEXT: [[MUL_2_254:%.*]] = fmul double [[TMP2]], [[TMP4]]
-; CHECK-NEXT: store double [[MUL_2_254]], ptr [[ARRAYIDX23_2_253]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <2 x double> [[TMP3]], i64 2)
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8
; CHECK-NEXT: store double [[TMP2]], ptr [[PHASES_IN]], align 8
; CHECK-NEXT: ret void
;
|
Enables splat support for loads with lanes> 2 or number of operands> 2.
Allows better detect splats of loads and reduces number of shuffles in
some cases.
X86, AVX512, -O3+LTO
Metric: size..text
results results0 diff
test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 154867.00 156723.00 1.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12467735.00 12468023.00 0.0%
Better vectorization quality