[WebAssembly] Optimize convert_iKxN_u into convert_iKxN_s #149609
Conversation
@llvm/pr-subscribers-backend-webassembly

Author: Arseny Kapoulkine (zeux)

Changes

convert_iKxN_s is canonicalized into convert_iKxN_u when the argument is known to have sign bit 0. This results in emitting Wasm opcodes that, on some targets (like x86_64), are dramatically slower than the signed versions on major engines.

Similarly to the X86 backend, we now fix this up in isel when the instruction carries the nonneg flag from canonicalization, or when we know the source has a zero sign bit.

Fixes #149457.

Full diff: https://github.com/llvm/llvm-project/pull/149609.diff

3 Files Affected:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index bf2e04caa0a61..aa68f6f647918 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2934,6 +2934,25 @@ performVectorExtendToFPCombine(SDNode *N,
return DAG.getNode(N->getOpcode(), SDLoc(N), ResVT, Conv);
}
+static SDValue
+performVectorNonNegToFPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ auto &DAG = DCI.DAG;
+
+ SDNodeFlags Flags = N->getFlags();
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // Optimize uitofp to sitofp when the sign bit is known to be zero.
+ // Depending on the target (runtime) backend, this might be performance
+ // neutral (e.g. AArch64) or a significant improvement (e.g. x86_64).
+ if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
+
+ return SDValue();
+}
+
static SDValue
performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
auto &DAG = DCI.DAG;
@@ -3515,6 +3534,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
return performVectorExtendCombine(N, DCI);
case ISD::UINT_TO_FP:
+ if (auto ExtCombine = performVectorExtendToFPCombine(N, DCI))
+ return ExtCombine;
+ return performVectorNonNegToFPCombine(N, DCI);
case ISD::SINT_TO_FP:
return performVectorExtendToFPCombine(N, DCI);
case ISD::FP_TO_SINT_SAT:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
index 8459ec8101ff2..b355a0d60317b 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -441,3 +441,31 @@ define <2 x double> @promote_mixed_v2f64(<4 x float> %x, <4 x float> %y) {
%a = fpext <2 x float> %v to <2 x double>
ret <2 x double> %a
}
+
+define <4 x float> @convert_u_v4f32_maybeneg(<4 x i32> %x) {
+; CHECK-LABEL: convert_u_v4f32_maybeneg:
+; CHECK: .functype convert_u_v4f32_maybeneg (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+ %a = ashr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
+
+define <4 x float> @convert_u_v4f32_nonneg(<4 x i32> %x) {
+; CHECK-LABEL: convert_u_v4f32_nonneg:
+; CHECK: .functype convert_u_v4f32_nonneg (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32x4.shr_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+ %a = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
index c93b8aa7fb42e..eb39f90e68701 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
@@ -12,7 +12,7 @@ define <4 x float> @extend_to_float_low_i16x8_u(<8 x i16> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extended = uitofp <4 x i16> %low to <4 x float>
@@ -25,7 +25,7 @@ define <4 x float> @extend_to_float_high_i16x8_u(<8 x i16> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.extend_high_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%extended = uitofp <4 x i16> %high to <4 x float>
@@ -39,7 +39,7 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) {
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extended = uitofp <4 x i8> %low to <4 x float>
@@ -55,7 +55,7 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) {
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i16x8.extend_low_i8x16_u
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f32x4.convert_i32x4_u
+; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%extended = uitofp <4 x i8> %high to <4 x float>
@@ -136,7 +136,7 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.extend_low_i16x8_u
-; CHECK-NEXT: f64x2.convert_low_i32x4_u
+; CHECK-NEXT: f64x2.convert_low_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
%extended = uitofp <2 x i16> %low to <2 x double>
Force-pushed from 403d9a2 to bb563ac.
convert_iKxN_s is canonicalized into convert_iKxN_u when the argument is known to have sign bit 0. This results in emitting Wasm opcodes that, on some targets (like x86_64), are dramatically slower than the signed versions on major engines.
Similarly to the X86 backend, we now fix this up in isel when the instruction carries the nonneg flag from canonicalization, or when we know the source has a zero sign bit.
Fixes #149457.
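
For reference, a minimal IR sketch of the pattern this change targets, modeled on the convert_u_v4f32_nonneg test added in the diff; the function name and the llc invocation are illustrative, not taken from the patch:

; Build with something like: llc -mtriple=wasm32-unknown-unknown -mattr=+simd128
define <4 x float> @uitofp_known_nonneg(<4 x i32> %x) {
  ; The logical shift right clears the sign bit of every lane, so
  ; SignBitIsZero holds for %a even though the conversion below is unsigned.
  %a = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ; Previously this selected f32x4.convert_i32x4_u; with the new combine the
  ; backend emits f32x4.convert_i32x4_s, which major engines lower much more
  ; cheaply on x86_64 hosts.
  %b = uitofp <4 x i32> %a to <4 x float>
  ret <4 x float> %b
}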