diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b2132562ac3f..92c6148cd7ca 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -399,6 +399,11 @@ class CombinerHelper { void applyCombineTruncOfExt(MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo); + /// Transform (shl (and x, imm1), imm2) to (shl x, imm2) + /// if (~imm1 << imm2) = 0 + bool matchCombineShlOfAnd(MachineInstr &MI, Register &Reg); + void applyCombineShlOfAnd(MachineInstr &MI, Register &Reg); + /// Transform trunc (shl x, K) to shl (trunc x), K /// if K < VT.getScalarSizeInBits(). /// diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 3c8d968c2764..d452a08e3c42 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -822,6 +822,15 @@ def trunc_ext_fold: GICombineRule < (apply [{ Helper.applyCombineTruncOfExt(*${root}, ${matchinfo}); }]) >; +// Under certain conditions, transform: +// (shl (and x, imm1), imm2) -> (shl x, imm2) +def shl_and_fold: GICombineRule < + (defs root:$root, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_SHL):$root, + [{ return Helper.matchCombineShlOfAnd(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyCombineShlOfAnd(*${root}, ${matchinfo}); }]) +>; + // Under certain conditions, transform: // trunc (shl x, K) -> shl (trunc x), K // trunc ([al]shr x, K) -> (trunc ([al]shr (trunc x), K)) @@ -1588,7 +1597,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero, bitcast_bitcast_fold, fptrunc_fpext_fold, right_identity_neg_zero_fp, right_identity_neg_one_fp, - combine_inttoptr_constant]>; + combine_inttoptr_constant, shl_and_fold]>; def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p, overlapping_and, mulo_by_2, mulo_by_0, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ec7ca5dc8e2b..64baad5439ac 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2553,6 +2553,39 @@ void CombinerHelper::applyCombineTruncOfExt( MI.eraseFromParent(); } +bool CombinerHelper::matchCombineShlOfAnd(MachineInstr &MI, Register &Reg) { + // We're trying to match the following pattern: + // %t = G_AND %x, imm1 + // %root = G_SHL %t, imm2 + // --> + // %root = G_SHL %x, imm2 + // Where (~imm1 << imm2) = 0 + assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected a G_SHL"); + const Register DstReg = MI.getOperand(0).getReg(); + const Register SrcReg = MI.getOperand(1).getReg(); + const LLT SrcTy = MRI.getType(SrcReg); + const unsigned Size = SrcTy.getSizeInBits(); + + // Try to match shl (and x, imm1), imm2 + int64_t ShiftImm, AndImm; + if (!mi_match(DstReg, MRI, + m_GShl(m_OneNonDBGUse(m_GAnd(m_Reg(Reg), m_ICst(AndImm))), + m_ICst(ShiftImm)))) + return false; + // The G_AND is redundant only if every bit it clears (i.e. every bit of + // ~AndImm) is shifted out of the Size-bit result by ShiftImm; equivalently, + // (~AndImm << ShiftImm) must have no bits set within the low Size bits.
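+  // For example, on s16 with AndImm = 0x00FF and ShiftImm = 8, (~0x00FF << 8) & 0xFFFF == 0, so the G_AND can be dropped; with ShiftImm = 4, (~0x000F << 4) & 0xFFFF == 0xFF00 != 0 and the match must fail.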
+  uint64_t Mask = ~0ULL >> (64 - Size); +  return !((~AndImm << ShiftImm) & Mask); +} + +void CombinerHelper::applyCombineShlOfAnd(MachineInstr &MI, Register &Reg) { +  assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected a G_SHL"); +  Observer.changingInstr(MI); +  MI.getOperand(1).setReg(Reg); +  Observer.changedInstr(MI); +} + static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) { const unsigned ShiftSize = ShiftTy.getScalarSizeInBits(); const unsigned TruncSize = TruncTy.getScalarSizeInBits(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir new file mode 100644 index 000000000000..3c83b5c165fc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir @@ -0,0 +1,152 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_combine_shl_of_and_I16_shift_8 +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_of_and_I16_shift_8 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[SHL]](s16) + ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) + %0:_(s32) = COPY $w0 + %1:_(s16) = G_CONSTANT i16 8 + %2:_(s16) = G_CONSTANT i16 255 + %3:_(s16) = G_TRUNC %0 + %4:_(s16) = G_AND %3, %2 + %5:_(s16) = G_SHL %4, %1 + %6:_(s32) = G_SEXT %5 + $w0 = COPY %6(s32) +... +--- +# Negative test case: imm2 (4) does not shift out all of the bits cleared by imm1 (15), so (~imm1 << imm2) != 0 and the G_AND must stay. +name: test_combine_shl_of_and_I16_shift_4_neg +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_of_and_I16_shift_4_neg + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND]], [[C]](s16) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[SHL]](s16) + ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) + %0:_(s32) = COPY $w0 + %1:_(s16) = G_CONSTANT i16 4 + %2:_(s16) = G_CONSTANT i16 15 + %3:_(s16) = G_TRUNC %0 + %4:_(s16) = G_AND %3, %2 + %5:_(s16) = G_SHL %4, %1 + %6:_(s32) = G_SEXT %5 + $w0 = COPY %6(s32) +...
+--- +name: test_combine_shl_of_and_I32_shift_16 +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_16 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: $w0 = COPY [[SHL]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 16 + %2:_(s32) = G_CONSTANT i32 65535 + %3:_(s32) = G_AND %0, %2 + %4:_(s32) = G_SHL %3, %1 + $w0 = COPY %4(s32) +... +--- +name: test_combine_shl_of_and_I32_shift_24 +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_24 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: $w0 = COPY [[SHL]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 24 + %2:_(s32) = G_CONSTANT i32 16777215 + %3:_(s32) = G_AND %0, %2 + %4:_(s32) = G_SHL %3, %1 + $w0 = COPY %4(s32) +... +--- +# Negative test case: imm2 (8) does not shift out all of the bits cleared by imm1 (255), so (~imm1 << imm2) != 0 and the G_AND must stay. +name: test_combine_shl_of_and_I32_shift_8_neg +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_8_neg + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32) + ; CHECK-NEXT: $w0 = COPY [[SHL]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 8 + %2:_(s32) = G_CONSTANT i32 255 + %3:_(s32) = G_AND %0, %2 + %4:_(s32) = G_SHL %3, %1 + $w0 = COPY %4(s32) +... +--- +# Negative test case: the G_AND and G_SHL cannot be combined because imm1 (255) and imm2 (16) do not satisfy the condition (~imm1 << imm2) = 0. +name: test_combine_shl_of_and_I32_shift_16_neg +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_16_neg + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32) + ; CHECK-NEXT: $w0 = COPY [[SHL]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 16 + %2:_(s32) = G_CONSTANT i32 255 + %3:_(s32) = G_AND %0, %2 + %4:_(s32) = G_SHL %3, %1 + $w0 = COPY %4(s32) +...
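For reference, the legality condition these tests exercise can be checked in isolation. Below is a minimal standalone C++ sketch (not part of the patch; the helper name shlOfAndIsFoldable is invented for illustration) that replays matchCombineShlOfAnd's bit arithmetic on the constants used in the MIR tests above:

#include <cassert>
#include <cstdint>

// True when (shl (and x, AndImm), ShiftImm) on a Size-bit scalar can be
// rewritten as (shl x, ShiftImm): every bit cleared by the mask must be
// shifted out of the low Size bits.
static bool shlOfAndIsFoldable(uint64_t AndImm, unsigned ShiftImm,
                               unsigned Size) {
  uint64_t Mask = ~0ULL >> (64 - Size); // Select the low Size bits.
  return ((~AndImm << ShiftImm) & Mask) == 0;
}

int main() {
  assert(shlOfAndIsFoldable(255, 8, 16));       // I16_shift_8: folds.
  assert(!shlOfAndIsFoldable(15, 4, 16));       // I16_shift_4_neg: 0xFF00 left over.
  assert(shlOfAndIsFoldable(65535, 16, 32));    // I32_shift_16: folds.
  assert(shlOfAndIsFoldable(16777215, 24, 32)); // I32_shift_24: folds.
  assert(!shlOfAndIsFoldable(255, 8, 32));      // I32_shift_8_neg: mask bits survive.
  assert(!shlOfAndIsFoldable(255, 16, 32));     // I32_shift_16_neg: 0xFF000000 left over.
  return 0;
}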
diff --git a/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll b/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll index 5a70d6286429..5e2a077b515d 100644 --- a/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll +++ b/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll @@ -14,10 +14,10 @@ define dso_local noundef float @bfloat16_to_float_test(%class.bfloat16 %bf.coerc ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv -; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mova r0, #16; extend.u16 r1, r1 // Delay Slot 2 +; CHECK-NEXT: mova r0, #16 // Delay Slot 2 ; CHECK-NEXT: lshl r0, r1, r0 // Delay Slot 1 entry: %bf.coerce.fca.0.extract = extractvalue %class.bfloat16 %bf.coerce, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index 496ee9f2dbb2..937911dcab15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -262,11 +262,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_sub_i32 s1, s1, 64 ; GFX7-NEXT: s_sub_i32 s0, s0, 64 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff +; GFX7-NEXT: s_add_i32 s1, s1, 0xffc00000 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; @@ -304,11 +303,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_add_i32 s1, s1, 4 ; GFX7-NEXT: s_sub_i32 s0, s0, 64 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff +; GFX7-NEXT: s_add_i32 s1, s1, 0x40000 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; @@ -346,11 +344,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_sub_i32 s1, s1, 64 ; GFX7-NEXT: s_add_i32 s0, s0, 4 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff +; GFX7-NEXT: s_add_i32 s1, s1, 0xffc00000 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; @@ -388,9 +385,8 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: s_add_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -439,9 +435,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: s_add_i32 
s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -495,9 +490,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX7-NEXT: s_or_b32 s2, s3, s2 ; GFX7-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX7-NEXT: s_lshr_b32 s3, s2, 16 -; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: s_add_i32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s1, s1, 0xffff +; GFX7-NEXT: s_add_i32 s1, s1, s3 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -556,11 +550,10 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX7-NEXT: s_lshr_b32 s2, s0, 16 ; GFX7-NEXT: s_lshr_b32 s3, s1, 16 -; GFX7-NEXT: s_add_i32 s2, s2, s3 ; GFX7-NEXT: s_add_i32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_add_i32 s2, s2, s3 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_lshl_b32 s1, s2, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 4be00fedb972..887d42e44125 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -688,11 +688,9 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) { define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_and_b32 s5, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_or_b32 s6, s5, s6 ; GFX6-NEXT: s_and_b32 s7, s7, 0xffff @@ -741,11 +739,9 @@ define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1 define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_and_b32 s5, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_or_b32 s6, s5, s6 ; GFX6-NEXT: s_and_b32 s7, s7, 0xffff @@ -794,7 +790,6 @@ define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inr define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v3i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_and_b32 s5, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 @@ -802,9 +797,8 @@ define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 ; GFX6-NEXT: s_or_b32 s6, s5, s6 ; GFX6-NEXT: s_and_b32 s7, s7, 0xffff ; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GFX6-NEXT: s_and_b32 s1, s3, 0xffff ; GFX6-NEXT: s_and_b32 s0, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s4, 0xffff ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] @@ -866,10 +860,8 @@ define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) { ; GFX6-LABEL: v_andn2_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index ec832ed0f7f3..70d4265c489e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -815,11 +815,10 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_ashr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, s3 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_ashr_i32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_ashr_i32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -873,7 +872,6 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -911,7 +909,6 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -966,14 +963,12 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) { ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1009,21 +1004,19 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_ashr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, s5 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_ashr_i32 s0, s0, s4 +; GFX6-NEXT: s_ashr_i32 s1, s1, s5 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s6 ; GFX6-NEXT: s_ashr_i32 s3, s3, s7 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1133,7 +1126,6 @@ define <4 x float> 
@v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-NEXT: v_ashrrev_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, v8, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 @@ -1144,17 +1136,14 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, v8, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1200,37 +1189,33 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_ashr_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, s9 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_ashr_i32 s0, s0, s8 +; GFX6-NEXT: s_ashr_i32 s1, s1, s9 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s10 ; GFX6-NEXT: s_ashr_i32 s3, s3, s11 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sext_i32_i16 s4, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, s12 ; GFX6-NEXT: s_ashr_i32 s5, s5, s13 +; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, s12 -; GFX6-NEXT: s_sext_i32_i16 s6, s6 -; GFX6-NEXT: s_ashr_i32 s7, s7, s15 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_ashr_i32 s6, s6, s14 +; GFX6-NEXT: s_ashr_i32 s7, s7, s15 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir index 6a291510fe66..c5a5b3e4750d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir @@ -113,10 +113,8 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX6-NEXT: 
%masked:_(s32) = G_AND %narrow, %masklow30 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) ; @@ -124,10 +122,8 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) %narrow:_(s32) = COPY $vgpr0 @@ -151,10 +147,8 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) ; @@ -162,10 +156,8 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) %narrow:_(s32) = COPY $vgpr0 @@ -189,10 +181,8 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) ; @@ -200,10 +190,8 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) %narrow:_(s32) = COPY $vgpr0 @@ -227,10 +215,8 @@ body: | ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0 - ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) ; @@ -238,10 +224,8 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
%narrow:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823 - ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32) ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) %narrow:_(s32) = COPY $vgpr0 @@ -280,10 +264,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32) - ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383 - ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16) ; GFX9-NEXT: %shl:_(s32) = G_ZEXT [[SHL]](s16) ; GFX9-NEXT: $vgpr0 = COPY %shl(s32) %argument:_(s32) = COPY $vgpr0 @@ -321,10 +303,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32) - ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383 - ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16) ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s16) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) %argument:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir index 6ceb41199af6..fff976e1042b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir @@ -14,10 +14,8 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX6-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32) - ; GFX6-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383 - ; GFX6-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16) ; GFX6-NEXT: %shl:_(s32) = G_ZEXT [[SHL]](s16) ; GFX6-NEXT: $vgpr0 = COPY %shl(s32) ; @@ -26,10 +24,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32) - ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383 - ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16) ; GFX9-NEXT: %shl:_(s32) = G_ZEXT [[SHL]](s16) ; GFX9-NEXT: $vgpr0 = COPY %shl(s32) %argument:_(s32) = COPY $vgpr0 @@ -54,10 +50,8 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX6-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32) - ; GFX6-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383 - ; GFX6-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16) ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s16) ; GFX6-NEXT: 
$vgpr0_vgpr1 = COPY %shl(s64) ; @@ -66,10 +60,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32) - ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383 - ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16) ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s16) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64) %argument:_(s32) = COPY $vgpr0 @@ -94,10 +86,8 @@ body: | ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX6-NEXT: %narrow:_(s8) = G_TRUNC %argument(s32) - ; GFX6-NEXT: %masklow6:_(s8) = G_CONSTANT i8 63 - ; GFX6-NEXT: %masked:_(s8) = G_AND %narrow, %masklow6 ; GFX6-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2 - ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8) + ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %narrow, [[C]](s8) ; GFX6-NEXT: %result:_(s32) = G_ZEXT [[SHL]](s8) ; GFX6-NEXT: $vgpr0 = COPY %result(s32) ; @@ -106,10 +96,8 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0 ; GFX9-NEXT: %narrow:_(s8) = G_TRUNC %argument(s32) - ; GFX9-NEXT: %masklow6:_(s8) = G_CONSTANT i8 63 - ; GFX9-NEXT: %masked:_(s8) = G_AND %narrow, %masklow6 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %narrow, [[C]](s8) ; GFX9-NEXT: %result:_(s32) = G_ZEXT [[SHL]](s8) ; GFX9-NEXT: $vgpr0 = COPY %result(s32) %argument:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 02781e763f44..0416d7046418 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -751,18 +751,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 -; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v9 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x9000000, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -789,20 +788,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 ; VI-NEXT: v_add_u16_e32 v9, 9, v1 -; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 
v7, 9, v7 ; VI-NEXT: v_add_u16_e32 v8, 9, v8 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_and_b32_e32 v1, 0xff, v8 ; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v2, v0, v2 +; VI-NEXT: v_or_b32_e32 v2, v0, v10 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index f9b98059be0b..53db0574ae48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -709,7 +709,6 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -736,7 +735,6 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -745,25 +743,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s2, s3, s4 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff +; GFX10-NEXT: s_and_b32 s4, s5, 7 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s3, s3, s4 +; GFX10-NEXT: s_lshr_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -772,25 +769,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 
s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s4, s5 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-NEXT: s_and_b32 s4, s5, 7 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s3, s3, s4 +; GFX11-NEXT: s_lshr_b32 s2, s2, s5 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_or_b32 s1, s3, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -851,9 +847,7 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -877,65 +871,59 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_not_b32_e32 v6, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_not_b32_e32 v7, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshrrev_b16 v5, 1, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 +; GFX10-NEXT: v_lshlrev_b16 v2, v4, v3 +; GFX10-NEXT: 
v_lshrrev_b16 v3, v7, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_not_b32_e32 v7, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_not_b32_e32 v6, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-NEXT: v_not_b32_e32 v7, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshrrev_b16 v5, 1, v5 ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_lshrrev_b16 v1, v6, v1 +; GFX11-NEXT: v_lshlrev_b16 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b16 v3, v7, v5 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 @@ -977,20 +965,19 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_andn2_b32 s6, 7, s7 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_lshr_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s4, s5, s4 -; GFX6-NEXT: s_lshr_b32 s1, s1, s6 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_or_b32 s1, s4, s1 +; GFX6-NEXT: s_lshl_b32 s4, s5, s4 +; GFX6-NEXT: s_lshr_b32 s1, s1, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s3, 0xff +; GFX6-NEXT: s_or_b32 s1, s4, s1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -1044,8 +1031,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 
24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1097,8 +1083,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; @@ -1108,48 +1093,47 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_and_b32 s11, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s11 +; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s11 -; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s7, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s1, s6, 0xff +; GFX10-NEXT: s_and_b32 s6, s9, 7 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s9 +; GFX10-NEXT: s_and_b32 s7, s10, 7 +; GFX10-NEXT: s_andn2_b32 s9, 7, s10 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-NEXT: s_lshl_b32 s4, s4, s7 +; GFX10-NEXT: s_lshr_b32 s6, s6, s9 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_or_b32 s3, s4, s6 +; GFX10-NEXT: s_and_b32 s4, s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s6, s8, 1 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_lshr_b32 s2, s6, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1161,48 +1145,47 @@ define 
amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_and_b32 s11, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s11 +; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 -; GFX11-NEXT: s_lshr_b32 s7, s8, 1 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s7, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s1, s6, 0xff +; GFX11-NEXT: s_and_b32 s6, s9, 7 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s3, s3, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s9 +; GFX11-NEXT: s_and_b32 s7, s10, 7 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s10 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 +; GFX11-NEXT: s_lshl_b32 s4, s4, s7 +; GFX11-NEXT: s_lshr_b32 s6, s6, s9 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_or_b32 s3, s4, s6 +; GFX11-NEXT: s_and_b32 s4, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s6, s8, 1 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_lshr_b32 s2, s6, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_or_b32 s2, s4, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1249,20 +1232,19 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 ; GFX6-NEXT: 
v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1306,15 +1288,13 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1356,67 +1336,63 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_not_b32_e32 v9, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v8 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 +; GFX10-NEXT: v_not_b32_e32 v9, v10 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_not_b32_e32 v8, v11 -; GFX10-NEXT: 
v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_e32 v13, v2 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_not_b32_e32 v14, v11 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_lshrrev_b16 v6, v9, v6 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 +; GFX10-NEXT: v_not_b32_e32 v11, v2 +; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_lshrrev_b16 v8, v8, v12 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v11 +; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 +; GFX10-NEXT: v_lshrrev_b16 v4, v6, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1425,55 +1401,53 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-NEXT: v_not_b32_e32 v13, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: v_not_b32_e32 v11, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-NEXT: v_not_b32_e32 v13, v10 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: 
v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_not_b32_e32 v9, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 +; GFX11-NEXT: v_lshrrev_b16 v6, v12, v6 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_not_b32_e32 v12, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 -; GFX11-NEXT: v_lshlrev_b16 v5, v11, v5 -; GFX11-NEXT: v_lshrrev_b16 v7, v13, v8 -; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 -; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v10, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v12 +; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 +; GFX11-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v2, v2, v5 +; GFX11-NEXT: v_lshrrev_b16 v5, v7, v8 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -1911,8 +1885,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_bfe_u32 v2, v0, 8, 8 ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 @@ -1925,7 +1898,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-LABEL: s_fshl_v2i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff @@ -1933,47 +1905,43 @@ define amdgpu_ps i48 
@s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff -; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff @@ -1988,10 +1956,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 -; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 @@ -2033,8 +2000,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -2045,7 +2011,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-LABEL: s_fshl_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s6, s0, 8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 16 ; GFX9-NEXT: s_lshr_b32 s8, s0, 24 ; 
GFX9-NEXT: s_and_b32 s0, s0, 0xff @@ -2053,47 +2018,43 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff ; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_and_b32 s6, s9, 0xff -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: s_or_b32 s1, s8, s1 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_lshr_b32 s6, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: s_lshr_b32 s7, s2, 16 ; GFX9-NEXT: s_lshr_b32 s8, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_lshr_b32 s9, s3, 8 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_and_b32 s6, s9, 0xff -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: s_or_b32 s3, s8, s3 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s4, 8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s4, 16 ; GFX9-NEXT: s_lshr_b32 s8, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff @@ -2108,7 +2069,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX9-NEXT: s_lshr_b32 s9, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_and_b32 s6, s9, 0xff ; GFX9-NEXT: s_or_b32 s5, s8, s5 @@ -2151,8 +2111,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_and_or_b32 v2, v1, v2, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_bfe_u32 v2, v0, 8, 8 ; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 @@ -2164,116 +2123,109 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-LABEL: s_fshl_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff -; GFX10-NEXT: 
s_lshr_b32 s7, s4, 8 -; GFX10-NEXT: s_lshr_b32 s11, s4, 24 -; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s14, s4, 8 +; GFX10-NEXT: s_lshr_b32 s15, s4, 16 +; GFX10-NEXT: s_lshr_b32 s16, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_lshr_b32 s12, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s10, 0xff -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_and_b32 s5, s5, 0xff -; GFX10-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 -; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_and_b32 s15, s15, 0xff +; GFX10-NEXT: s_or_b32 s4, s4, s14 +; GFX10-NEXT: s_and_b32 s14, 0xffff, s15 +; GFX10-NEXT: s_lshr_b32 s17, s5, 8 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_and_b32 s7, s12, 0xff -; GFX10-NEXT: s_or_b32 s5, s11, s5 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_lshl_b32 s14, s14, 16 +; GFX10-NEXT: s_or_b32 s5, s16, s5 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: s_and_b32 s16, s17, 0xff +; GFX10-NEXT: s_or_b32 s4, s4, s14 +; GFX10-NEXT: s_and_b32 s15, 0xffff, s16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: s_lshl_b32 s14, s15, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s14 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_and_b32 s7, s9, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_lshr_b32 s10, s2, 8 +; GFX10-NEXT: s_lshr_b32 s11, s2, 16 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshr_b32 s12, s2, 24 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_and_b32 s8, s9, 0xff +; GFX10-NEXT: s_lshl_b32 s9, s10, 8 +; GFX10-NEXT: s_and_b32 s10, s11, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: s_or_b32 s2, s2, s9 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s10 +; GFX10-NEXT: s_lshr_b32 s13, s3, 8 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX10-NEXT: s_lshl_b32 s6, s6, 16 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX10-NEXT: s_lshr_b32 s4, s3, 8 
-; GFX10-NEXT: s_and_b32 s5, s9, 0xff -; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s3, s10, s3 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_and_b32 s11, s13, 0xff +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s12, s3 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, 24 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s11 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 24, v0 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s8 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX10-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, 8 -; GFX10-NEXT: v_lshl_or_b32 v0, s1, v0, v3 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10-NEXT: v_lshl_or_b32 v2, s0, v2, v3 +; GFX10-NEXT: v_lshrrev_b32_e64 v4, v4, s2 +; GFX10-NEXT: s_lshl_b32 s2, s5, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v4 -; GFX10-NEXT: v_bfe_u32 
v4, v0, 8, 8 +; GFX10-NEXT: s_or_b32 s0, s1, s2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX10-NEXT: v_or3_b32 v1, v1, v3, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v3, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v5 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2281,130 +2233,124 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-LABEL: s_fshl_v2i24: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-NEXT: s_lshr_b32 s14, s4, 8 +; GFX11-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-NEXT: s_lshr_b32 s16, s4, 24 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_and_b32 s14, 0xffff, s15 +; GFX11-NEXT: s_lshr_b32 s17, s5, 8 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_and_b32 s14, s17, 0xff +; GFX11-NEXT: s_or_b32 s5, s16, s5 +; GFX11-NEXT: s_and_b32 s14, 0xffff, s14 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 +; GFX11-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-NEXT: s_or_b32 s5, s5, s14 +; GFX11-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshr_b32 s10, s2, 8 +; GFX11-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-NEXT: s_lshr_b32 s12, s2, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_lshl_b32 s9, s10, 8 +; GFX11-NEXT: s_and_b32 s10, s11, 0xff +; GFX11-NEXT: s_or_b32 s2, s2, s9 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshr_b32 s13, s3, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-NEXT: s_or_b32 s3, s12, s3 ; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: s_lshr_b32 s6, s4, 8 -; GFX11-NEXT: s_lshr_b32 s10, s4, 24 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s11, s5, 8 -; GFX11-NEXT: 
s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_and_b32 s6, s11, 0xff -; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: s_or_b32 s5, s10, s5 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: s_or_b32 s5, s5, s6 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s2, 8 -; GFX11-NEXT: s_or_b32 s1, s8, s1 -; GFX11-NEXT: s_lshr_b32 s8, s2, 16 -; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 -; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 24 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s7, 8 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX11-NEXT: s_or_b32 s2, s2, s7 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s8 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX11-NEXT: s_and_b32 s5, s8, 0xff -; GFX11-NEXT: s_lshr_b32 s4, s3, 8 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_and_b32 s5, 0xffff, s10 +; GFX11-NEXT: s_and_b32 s4, s13, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s5 -; GFX11-NEXT: s_or_b32 s3, s9, s3 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: s_or_b32 s3, s3, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_lshr_b32 s3, s3, 1 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, 
v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 -; GFX11-NEXT: s_lshl_b32 s2, s6, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2 -; GFX11-NEXT: s_or_b32 s0, s1, s2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 -; GFX11-NEXT: v_lshrrev_b32_e64 v3, v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v3 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: s_or_b32 s0, s1, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX11-NEXT: v_lshrrev_b32_e64 v2, v3, s2 +; GFX11-NEXT: v_bfe_u32 v3, v1, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v0 ; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v3 ; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_or3_b32 v1, v1, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5 -; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index c8455665e7b4..cd5375886768 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -706,7 +706,6 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; 
GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -733,7 +732,6 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_or_b32 s1, s2, s1 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -746,21 +744,20 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -773,21 +770,20 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-NEXT: s_lshr_b32 s5, s2, 8 ; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s6 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 ; GFX11-NEXT: s_lshr_b32 s2, s4, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_or_b32 s1, s3, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -848,9 +844,7 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -873,65 +867,59 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX9-NEXT: 
v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_not_b32_e32 v7, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-NEXT: v_not_b32_e32 v6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_not_b32_e32 v7, v4 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v6, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, v7, v3 +; GFX10-NEXT: v_lshrrev_b16 v3, v4, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_not_b32_e32 v7, v4 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 -; 
GFX11-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v2, v7, v3 +; GFX11-NEXT: v_lshrrev_b16 v3, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 @@ -990,8 +978,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, 0xff -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1043,8 +1030,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1096,59 +1082,57 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_or_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshl_b32 s1, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s11, s2, 7 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s9, 7 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 +; GFX10-NEXT: s_lshr_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s9, 7 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_andn2_b32 s3, 7, s10 +; GFX10-NEXT: s_lshr_b32 s1, s6, s1 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_andn2_b32 s9, 7, s10 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, 
s6, s2 -; GFX10-NEXT: s_andn2_b32 s4, 7, s11 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-NEXT: s_lshl_b32 s4, s4, s9 +; GFX10-NEXT: s_lshr_b32 s6, s7, s6 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_or_b32 s3, s4, s6 +; GFX10-NEXT: s_andn2_b32 s4, 7, s2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 -; GFX10-NEXT: s_and_b32 s6, s11, 7 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s8, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_lshr_b32 s2, s8, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_or_b32 s2, s4, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1157,51 +1141,50 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_and_b32 s11, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s9, 7 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s12 +; GFX11-NEXT: s_lshr_b32 s1, s1, s11 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s1, s9, 7 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s3, s9 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s10, 7 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 +; GFX11-NEXT: s_lshr_b32 s1, s6, s1 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s10 ; GFX11-NEXT: s_lshl_b32 s4, s4, 1 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 +; GFX11-NEXT: s_lshl_b32 s4, s4, s9 +; GFX11-NEXT: s_lshr_b32 s6, s7, s6 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_or_b32 s3, s4, s6 +; GFX11-NEXT: s_and_not1_b32 s4, 7, s2 ; GFX11-NEXT: s_lshl_b32 s5, s5, 1 -; GFX11-NEXT: s_and_b32 s6, s11, 7 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s8, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; 
GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_lshr_b32 s2, s8, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_or_b32 s2, s4, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1264,8 +1247,7 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,15 +1289,13 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1357,124 +1337,118 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-NEXT: v_not_b32_e32 v5, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_not_b32_e32 v10, v5 -; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: 
v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_not_b32_e32 v14, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 -; GFX10-NEXT: v_not_b32_e32 v10, v11 +; GFX10-NEXT: v_not_b32_e32 v10, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX10-NEXT: v_not_b32_e32 v13, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v5, v3 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 0xff, v1 +; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v13 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_not_b32_e32 v5, v12 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 +; GFX10-NEXT: v_lshrrev_b16 v2, v2, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v12 +; GFX10-NEXT: v_mov_b32_e32 v8, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b16 v2, v5, v6 +; GFX10-NEXT: v_lshrrev_b16 v4, v7, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-NEXT: v_not_b32_e32 v9, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: 
v_lshrrev_b32_e32 v13, 24, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-NEXT: v_not_b32_e32 v12, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-NEXT: v_not_b32_e32 v14, v11 -; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_not_b32_e32 v7, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_not_b32_e32 v10, v2 -; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 +; GFX11-NEXT: v_lshlrev_b16 v0, v9, v0 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v11 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v14 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-NEXT: v_not_b32_e32 v11, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4 -; GFX11-NEXT: v_lshrrev_b16 v6, v11, v8 -; GFX11-NEXT: v_lshlrev_b16 v5, v7, v5 -; GFX11-NEXT: v_lshrrev_b16 v7, v13, v9 -; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX11-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v7, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v11 +; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v8, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v4, v7, v5 +; GFX11-NEXT: v_lshrrev_b16 v5, v8, v10 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -1922,8 +1896,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_bfe_u32 v2, v0, 8, 8 ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 @@ -1936,19 +1909,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-LABEL: s_fshr_v2i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 +; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_and_b32 s6, s6, 0xff -; GFX8-NEXT: s_or_b32 s1, s8, s1 -; GFX8-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_and_b32 s8, s8, 0xff +; GFX8-NEXT: s_or_b32 s1, s8, s1 +; GFX8-NEXT: s_lshr_b32 s8, s2, 8 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: s_and_b32 s7, s9, 0xff @@ -1956,27 +1927,24 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_lshr_b32 s10, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: s_lshr_b32 s11, s3, 8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_and_b32 s8, s11, 0xff -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: s_or_b32 s3, s10, s3 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 ; GFX8-NEXT: s_lshr_b32 s10, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff @@ -1991,10 +1959,9 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX8-NEXT: s_lshr_b32 s11, s5, 8 -; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_and_b32 s8, s11, 0xff +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_or_b32 s5, s10, s5 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 @@ -2044,8 +2011,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: 
v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -2056,19 +2022,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-LABEL: s_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX9-NEXT: s_lshr_b32 s9, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s6, s0, 8 ; GFX9-NEXT: s_lshr_b32 s8, s0, 24 +; GFX9-NEXT: s_lshr_b32 s9, s1, 8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_or_b32 s1, s8, s1 -; GFX9-NEXT: s_lshr_b32 s8, s2, 8 ; GFX9-NEXT: s_lshr_b32 s7, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_or_b32 s1, s8, s1 +; GFX9-NEXT: s_lshr_b32 s8, s2, 8 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff ; GFX9-NEXT: s_and_b32 s7, s9, 0xff @@ -2076,27 +2040,24 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_or_b32 s2, s2, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_and_b32 s3, s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_or_b32 s2, s2, s8 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_and_b32 s8, s11, 0xff -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: s_or_b32 s3, s10, s3 ; GFX9-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s8 ; GFX9-NEXT: s_lshr_b32 s8, s4, 8 -; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff @@ -2111,7 +2072,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 ; GFX9-NEXT: s_and_b32 s8, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 @@ -2162,8 +2122,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_and_or_b32 v2, v1, v2, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_bfe_u32 v2, v0, 8, 8 ; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 @@ -2177,114 +2136,107 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: s_lshr_b32 s14, s4, 8 ; GFX10-NEXT: s_lshr_b32 s15, s4, 16 -; GFX10-NEXT: s_and_b32 s14, s14, 0xff ; GFX10-NEXT: s_lshr_b32 s16, s4, 24 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s15, s15, 0xff +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; 
GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_and_b32 s15, 0xffff, s15 +; GFX10-NEXT: s_and_b32 s15, s15, 0xff ; GFX10-NEXT: s_or_b32 s4, s4, s14 +; GFX10-NEXT: s_and_b32 s14, 0xffff, s15 ; GFX10-NEXT: s_lshr_b32 s17, s5, 8 -; GFX10-NEXT: s_and_b32 s5, s5, 0xff -; GFX10-NEXT: s_lshl_b32 s14, s15, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_lshl_b32 s14, s14, 16 +; GFX10-NEXT: s_or_b32 s5, s16, s5 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_and_b32 s15, s17, 0xff +; GFX10-NEXT: s_and_b32 s16, s17, 0xff ; GFX10-NEXT: s_or_b32 s4, s4, s14 -; GFX10-NEXT: s_or_b32 s5, s16, s5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_and_b32 s14, 0xffff, s15 +; GFX10-NEXT: s_and_b32 s15, 0xffff, s16 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s14, s14, 16 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: s_lshl_b32 s14, s15, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_or_b32 s5, s5, s14 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 -; GFX10-NEXT: s_or_b32 s5, s5, s14 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 ; GFX10-NEXT: s_lshr_b32 s11, s2, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s9, s9, 0xff -; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX10-NEXT: s_and_b32 s10, s10, 0xff ; GFX10-NEXT: s_lshr_b32 s12, s2, 24 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_and_b32 s8, 0xffff, s9 +; GFX10-NEXT: s_and_b32 s8, s9, 0xff +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: s_lshl_b32 s9, s10, 8 +; GFX10-NEXT: s_and_b32 s10, s11, 0xff ; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: s_or_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshr_b32 s13, s3, 8 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff ; GFX10-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_and_b32 s11, s13, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s3, s12, s3 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_lshl_b32 s6, s6, 17 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX10-NEXT: s_and_b32 s4, s11, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s10 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s11 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 ; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s4, s13, 0xff +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_lshl_b32 s7, s7, 17 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s4, s7, 17 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s8, 17 +; GFX10-NEXT: s_or_b32 s2, s3, s5 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s3 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1 -; GFX10-NEXT: s_or_b32 s0, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: s_or_b32 s0, s7, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX10-NEXT: v_and_or_b32 v2, 0xff, v1, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX10-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v2 ; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX10-NEXT: v_or3_b32 v1, v2, v1, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v4 +; GFX10-NEXT: v_or3_b32 v1, v1, v3, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v5 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2294,124 +2246,117 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: s_lshr_b32 s14, s4, 8 ; GFX11-NEXT: s_lshr_b32 s15, s4, 16 -; GFX11-NEXT: s_and_b32 s14, s14, 0xff ; GFX11-NEXT: s_lshr_b32 s16, s4, 24 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; 
GFX11-NEXT: s_and_b32 s15, 0xffff, s15 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_and_b32 s14, 0xffff, s15 ; GFX11-NEXT: s_lshr_b32 s17, s5, 8 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_lshl_b32 s14, s15, 16 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_lshl_b32 s14, s14, 16 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_and_b32 s15, s17, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s14 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_and_b32 s14, s17, 0xff ; GFX11-NEXT: s_or_b32 s5, s16, s5 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_and_b32 s14, 0xffff, s15 +; GFX11-NEXT: s_and_b32 s14, 0xffff, s14 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_lshl_b32 s14, s14, 16 -; GFX11-NEXT: s_lshr_b32 s10, s2, 8 -; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 ; GFX11-NEXT: s_or_b32 s5, s5, s14 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_lshr_b32 s9, s1, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshr_b32 s10, s2, 8 ; GFX11-NEXT: s_lshr_b32 s11, s2, 16 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s8, s0, 24 -; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshr_b32 s12, s2, 24 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_and_b32 s11, s11, 0xff -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff ; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s10, 8 +; GFX11-NEXT: s_and_b32 s10, s11, 0xff ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX11-NEXT: s_and_b32 s8, 0xffff, s9 -; GFX11-NEXT: s_and_b32 s9, 0xffff, s11 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 +; GFX11-NEXT: s_or_b32 s2, s2, s9 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s10 ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_lshl_b32 s6, s6, 17 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_lshr_b32 s13, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s6, s0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_and_b32 s13, s13, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s11, s13, 0xff ; GFX11-NEXT: s_or_b32 s3, s12, s3 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 -; GFX11-NEXT: s_lshl_b32 s4, s10, 8 -; GFX11-NEXT: s_and_b32 s10, 0xffff, s13 -; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: 
s_lshl_b32 s5, s7, 16 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s8 +; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_lshl_b32 s4, s9, 16 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s11 +; GFX11-NEXT: s_lshl_b32 s4, s4, 17 +; GFX11-NEXT: s_lshl_b32 s7, s8, 16 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_lshl_b32 s4, s7, 17 -; GFX11-NEXT: s_lshl_b32 s5, s10, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX11-NEXT: s_or_b32 s0, s4, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 -; GFX11-NEXT: s_or_b32 s2, s3, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_or_b32 s2, s3, s7 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1 -; GFX11-NEXT: s_lshl_b32 s0, s8, 17 -; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s0, s4, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v3, v1, 8, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v0, s0, v2, v0 -; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 -; GFX11-NEXT: v_bfe_u32 v2, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: 
v_bfe_u32 v0, v0, 16, 8 -; GFX11-NEXT: v_or3_b32 v1, v1, v3, v4 +; GFX11-NEXT: v_or3_b32 v1, v1, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v2 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 8, v5 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 @@ -4231,7 +4176,6 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s7, s7, 0xffff ; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s6, s6, s7 @@ -4484,7 +4428,6 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-LABEL: v_fshr_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index d5bfb7faf7fc..adf301a376f2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -17,7 +17,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -37,7 +36,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -71,36 +69,33 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s0 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 @@ -125,7 +120,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -143,7 +137,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -177,36 +170,34 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v2i8_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: 
v_cmp_eq_u32_e64 s0, s3, 1 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 @@ -229,7 +220,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -248,7 +238,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -281,37 +270,35 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_v_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; 
GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 @@ -335,7 +322,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -355,7 +341,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -389,38 +374,35 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, s4, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 @@ -443,7 +425,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; 
GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -462,7 +443,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -495,13 +475,12 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -511,21 +490,20 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; ; GFX11-LABEL: insertelement_s_v2i8_v_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 @@ -548,7 +526,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -566,7 +543,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -600,12 +576,11 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr 
addrspace(1) %ptr, i8 inreg ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -616,20 +591,18 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX11-LABEL: insertelement_v_v2i8_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 @@ -651,7 +624,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -668,7 +640,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -701,12 +672,11 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -717,16 +687,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX11-LABEL: 
insertelement_v_v2i8_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v2, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -752,7 +719,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -769,7 +735,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -802,12 +767,11 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -818,16 +782,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX11-LABEL: insertelement_v_v2i8_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v2, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll index 19b0057d69b6..b5c3367e055d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -315,7 +315,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xy(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog @@ -392,7 +391,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 unorm d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog @@ -469,7 +467,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog @@ -546,7 +543,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 unorm d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog @@ -623,10 +619,9 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_v3f16_xyz: @@ -702,8 +697,6 @@ define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 47e476de74cf..0881686e27d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -51,13 +51,11 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float % ; GFX10-LABEL: image_bvh_intersect_ray_a16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; GFX10-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX10-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, v9 ; GFX10-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -114,13 +112,11 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float ; GFX10-LABEL: image_bvh64_intersect_ray_a16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v8 ; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX10-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX10-NEXT: v_and_or_b32 v7, 0xffff, v7, v11 +; GFX10-NEXT: v_and_or_b32 v6, 0xffff, v6, v10 ; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -254,19 +250,17 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: v_mov_b32_e32 v13, v0 -; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 +; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; GFX1030-NEXT: v_mov_b32_e32 v15, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v17, v4 -; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 +; GFX1030-NEXT: v_and_or_b32 v19, 0xffff, v6, v1 +; GFX1030-NEXT: v_alignbit_b32 v20, v8, v7, 16 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: v_and_or_b32 v18, 0xffff, v5, v0 -; GFX1030-NEXT: v_and_or_b32 v19, 0xffff, v6, v1 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 @@ -297,14 +291,12 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; 
GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v13 ; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 +; GFX1013-NEXT: v_and_or_b32 v5, 0xffff, v5, v13 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 @@ -485,19 +477,17 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: v_mov_b32_e32 v14, v0 -; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 +; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4 ; GFX1030-NEXT: v_mov_b32_e32 v19, v5 -; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 -; GFX1030-NEXT: v_and_or_b32 v20, 0xffff, v6, v0 ; GFX1030-NEXT: v_and_or_b32 v21, 0xffff, v7, v1 +; GFX1030-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; GFX1030-NEXT: v_and_or_b32 v20, 0xffff, v6, v0 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 @@ -530,14 +520,12 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: ; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 ; GFX1013-NEXT: v_and_or_b32 v7, 0xffff, v7, v15 +; GFX1013-NEXT: v_and_or_b32 v6, 0xffff, v6, v14 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll index 36d5e914d40b..af695287991c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -86,11 +86,9 @@ define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffs ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec - ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec ; UNPACKED-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -157,19 +155,15 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec - ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec - ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec - ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec + ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec + ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec + ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 @@ -391,19 +385,15 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec - ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = 
V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
index 5b19b1c913a9..29acd4277e7b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
@@ -70,11 +70,9 @@ define amdgpu_ps <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_v
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -126,19 +124,15 @@ define amdgpu_ps <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_v
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -291,19 +285,15 @@ define amdgpu_ps <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_v
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
index a799e203d643..54fe9c383e65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
@@ -54,11 +54,9 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -107,19 +105,15 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
index 1cfb15391be3..4c8e2591b14c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
@@ -70,11 +70,9 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -138,19 +136,15 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 06560afee3c9..ad7e5651c33e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -46,16 +46,14 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_or_b32 v0, v0, v9, v1
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
 ; GFX906-NEXT:    v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v7
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v8
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
@@ -64,18 +62,16 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 8
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX10-NEXT:    v_dot4c_i32_i8 v8, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
index 870588014cd2..a68359a95056 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
@@ -78,11 +78,9 @@ define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -152,19 +150,15 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -272,19 +266,15 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
index 6c0319ef570d..352d954c9ea3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
@@ -60,11 +60,9 @@ define amdgpu_ps <2 x half> @struct_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgp
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -117,19 +115,15 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgp
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -217,19 +211,15 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
index 1a57c2e77bdd..7ecb91e1a9fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
@@ -79,11 +79,9 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -139,19 +137,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -315,19 +309,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
index f270f87aae66..b6850bb1b2f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -114,11 +114,9 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -194,19 +192,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -445,19 +439,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT: [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index b14af9e043e0..0c061d0aa02d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -46,16 +46,14 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_or_b32 v0, v0, v9, v1
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
 ; GFX906-NEXT:    v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v7
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
@@ -64,18 +62,16 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 8
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
@@ -87,19 +83,17 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX11-NEXT:    v_or3_b32 v1, v4, v1, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-NEXT:    v_and_or_b32 v3, 0xff, v4, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX11-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX11-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX11-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 %a.cast = bitcast <4 x i8> %a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06..04cea6567439 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -687,11 +685,9 @@ define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -740,11 +738,9 @@ define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1)
 define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v3i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -793,7 +789,6 @@ define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inre
 define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v3i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
@@ -801,9 +796,8 @@ define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
@@ -865,10 +859,8 @@ define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index a6f9bb7ee055..46dfe75fddc3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -279,9 +279,10 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -298,8 +299,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -315,9 +316,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -336,7 +337,6 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -411,11 +411,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -438,8 +437,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -458,10 +457,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -484,7 +483,6 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -555,8 +553,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -608,10 +605,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -771,8 +769,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -856,8 +853,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2828,9 +2824,8 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_max_i32 s2, s4, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -2909,9 +2904,8 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2977,9 +2971,8 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3068,22 +3061,20 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3179,19 +3170,17 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
 ; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s4, s6, s4
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -3349,27 +3338,24 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX6-NEXT:    v_max_i32_e32 v7, 0, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3499,27 +3485,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_max_i32 s7, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_add_i32 s5, s5, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -3715,37 +3698,33 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v7
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
+; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3907,37 +3886,33 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s7, 0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_max_i32 s9, s7, 0
 ; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index b666f4552166..e043823d28de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -2401,7 +2401,6 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2410,8 +2409,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_short v[0:1], v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v3
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
@@ -2481,25 +2479,21 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    s_xor_b32 s4, s11, s9
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
+; GFX9-NEXT:    s_xor_b32 s4, s11, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
+; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2507,17 +2501,18 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80018
-; GFX10-NEXT:    s_bfe_i32 s3, s0, 0x80010
-; GFX10-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX10-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX10-NEXT:    s_add_i32 s1, s1, s2
-; GFX10-NEXT:    s_add_i32 s3, s3, s8
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
-; GFX10-NEXT:    s_xor_b32 s3, s3, s8
+; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80010
+; GFX10-NEXT:    s_bfe_i32 s2, s0, 0x80018
+; GFX10-NEXT:    s_ashr_i32 s3, s1, 31
+; GFX10-NEXT:    s_ashr_i32 s8, s2, 31
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, s8
+; GFX10-NEXT:    s_xor_b32 s1, s1, s3
+; GFX10-NEXT:    s_xor_b32 s2, s2, s8
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GFX10-NEXT:    s_sub_i32 s6, 0, s1
+; GFX10-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -2525,15 +2520,14 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX10-NEXT:    s_sub_i32 s6, 0, s3
-; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX10-NEXT:    s_bfe_i32 s6, s0, 0x80008
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX10-NEXT:    s_sext_i32_i8 s6, s0
+; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x80008
 ; GFX10-NEXT:    s_ashr_i32 s9, s6, 31
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
-; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    s_add_i32 s6, s6, s9
 ; GFX10-NEXT:    s_add_i32 s0, s0, s10
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    s_xor_b32 s6, s6, s9
 ; GFX10-NEXT:    s_xor_b32 s0, s0, s10
@@ -2542,46 +2536,43 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s1
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s1, s9, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GFX10-NEXT:
v_xor_b32_e32 v2, s9, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 +; GFX10-NEXT: s_xor_b32 s1, s9, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: s_xor_b32 s0, s10, s8 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 +; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_short v1, v0, s[4:5] ; GFX10-NEXT: global_store_short v1, v2, s[6:7] @@ -2807,16 +2798,14 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 ; GFX8-NEXT: s_xor_b32 s0, s2, s10 -; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index bac80f0777c0..56561d163bb9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -709,9 +709,8 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) { ; GFX6-LABEL: s_sext_inreg_v2i16_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s1, s1, 0x50000 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x50000 ; 
GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -781,17 +780,15 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) { ; GFX6-LABEL: v_sext_inreg_v4i16_3: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 13 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 13 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 13 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 13 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 13 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -837,17 +834,15 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) { define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) { ; GFX6-LABEL: s_sext_inreg_v4i16_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s1, s1, 0x20000 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x20000 ; GFX6-NEXT: s_bfe_i32 s2, s2, 0x20000 ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x20000 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; @@ -953,29 +948,25 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) { ; GFX6-LABEL: v_sext_inreg_v8i16_11: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 5 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 5 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 5 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 5 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 5 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 5 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 5 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1039,29 +1030,25 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) { define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) { ; GFX6-LABEL: s_sext_inreg_v8i16_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s1, s1, 0xb0000 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0xb0000 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_bfe_i32 s1, s1, 0xb0000 ; GFX6-NEXT: s_bfe_i32 s2, s2, 0xb0000 ; GFX6-NEXT: s_bfe_i32 s3, s3, 0xb0000 ; GFX6-NEXT: 
s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_bfe_i32 s4, s4, 0xb0000 ; GFX6-NEXT: s_bfe_i32 s5, s5, 0xb0000 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_bfe_i32 s4, s4, 0xb0000 -; GFX6-NEXT: s_bfe_i32 s7, s7, 0xb0000 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_bfe_i32 s6, s6, 0xb0000 +; GFX6-NEXT: s_bfe_i32 s7, s7, 0xb0000 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 7ad19a479700..d48fda055059 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -10,24 +10,15 @@ define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) { ; GCN-LABEL: s_shl_i64_zext_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_andn2_b32 s0, s0, -2.0 ; GCN-NEXT: s_lshl_b32 s0, s0, 2 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_zext_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b32 s0, s0, -2.0 -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: s_shl_i64_zext_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b32 s0, s0, -2.0 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_zext_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -38,7 +29,6 @@ define i64 @v_shl_i64_zext_i32(i32 %x) { ; GCN-LABEL: v_shl_i64_zext_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -46,16 +36,14 @@ define i64 @v_shl_i64_zext_i32(i32 %x) { ; GFX10-LABEL: v_shl_i64_zext_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_shl_i64_zext_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3fffffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 @@ -553,7 +541,6 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) { define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { ; GFX7-LABEL: s_shl_i32_zext_i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX7-NEXT: s_lshl_b32 s0, s0, 2 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: ; return to shader 
part epilog @@ -585,7 +572,6 @@ define i32 @v_shl_i32_zext_i16(i16 %x) { ; GFX7-LABEL: v_shl_i32_zext_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -593,21 +579,18 @@ define i32 @v_shl_i32_zext_i16(i16 %x) { ; GFX8-LABEL: v_shl_i32_zext_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0x3fff, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_shl_i32_zext_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_shl_i32_zext_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff, v0 ; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 2, v0 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index c2f911cc4458..ff43ffc0382e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -780,9 +780,8 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -825,11 +824,10 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: shl_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 -; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -865,7 +863,6 @@ define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -916,14 +913,12 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) { ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -959,17 +954,15 @@ 
define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, s7 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; @@ -1063,7 +1056,6 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, v8, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1072,17 +1064,14 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) { ; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, v8, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1128,29 +1117,25 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s2, s2, s10 ; GFX6-NEXT: s_lshl_b32 s3, s3, s11 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshl_b32 s4, s4, s12 ; GFX6-NEXT: s_lshl_b32 s5, s5, s13 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, s12 -; GFX6-NEXT: s_lshl_b32 s7, s7, s15 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_lshl_b32 s6, s6, s14 +; GFX6-NEXT: s_lshl_b32 s7, s7, s15 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 320dfbb4980e..f6ee4b5d022f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -279,9 +279,10 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -298,8 +299,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -315,9 +316,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -336,7 +337,6 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -411,11 +411,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_ashr_i32 s0, s0, 8 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_ashr_i32 s1, s1, 8 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -438,8 +437,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -458,10 +457,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -484,7 +483,6 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 @@ -555,8 +553,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -608,10 +605,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 8, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -771,8 +769,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, 0xff -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -856,8 +853,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_ashr_i32 s3, s3, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshl_b32 s1, s3, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -2828,9 +2824,8 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: 
s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -2909,9 +2904,8 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2977,9 +2971,8 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3069,21 +3062,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3179,19 +3170,17 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; @@ -3349,27 +3338,24 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; 
GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3499,27 +3485,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s5, -1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; @@ -3715,37 +3698,33 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3907,37 +3886,33 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s7, -1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s7, -1 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_ashr_i32 s7, s7, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 0xffff -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 1821d29d4b05..02c671a3ccbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -225,8 +225,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -242,9 +242,9 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; @@ -263,7 +263,6 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -330,8 +329,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -350,10 +349,10 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -376,7 +375,6 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 @@ -452,10 +450,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -615,23 +614,24 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp ; GFX8-NEXT: s_lshl_b32 s1, s6, 8 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp ; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index a58397eccaba..4014437a2754 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1931,7 +1931,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc @@ -1939,8 +1938,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1996,7 +1994,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -2004,8 +2001,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -2020,17 +2016,17 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; 
GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2040,30 +2036,29 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2241,12 +2236,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 
a60370cd460f..43547d7f3a76 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -219,8 +219,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -236,9 +236,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -257,7 +257,6 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -322,8 +321,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -342,10 +341,10 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -368,7 +367,6 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -440,10 +438,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -599,23 +598,24 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, 8
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s3, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 8
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    v_sub_u16_e64 v2, s0, v2 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index d9cbbc11f9a7..305108330d19 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4420,9 +4420,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4460,9 +4459,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 526ee5a51745..942c61f33943 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -382,39 +382,36 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -917,74 +914,67 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v4i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v8, v8, v4
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v9, v9, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v11, v11, v7
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v11, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v3, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v9
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v11
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v7, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -1422,39 +1412,36 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1957,74 +1944,67 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v4i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v8, v8, v4
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v9, v9, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v11, v11, v7
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v11, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v3, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v9
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v11
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v7, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -7029,83 +7009,76 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -7337,83 +7310,76 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -8017,9 +7983,8 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 1a55bf608ebf..d8ce36429ca4 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1667,7 +1667,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -1795,7 +1794,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 7, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -1936,7 +1934,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7b, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2199,7 +2196,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2321,7 +2317,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0xffffc400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2456,7 +2451,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x4400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2594,10 +2588,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffe00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2718,10 +2711,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffe00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2969,10 +2961,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xfff00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3093,10 +3084,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xfff00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3343,10 +3333,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc400, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xc4000000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3508,10 +3497,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4400, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x44000000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3673,10 +3661,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4000, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 2.0, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3801,10 +3788,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc000, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -2.0, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3926,9 +3912,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm