diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -302,6 +302,8 @@
   void applyShiftOfShiftedLogic(MachineInstr &MI,
                                 ShiftOfShiftedLogic &MatchInfo);
 
+  bool matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Transform a multiply by a power-of-2 value to a left shift.
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
   void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4034,6 +4034,19 @@
     return true;
   }
 
+  /// GlobalISel - return true if it is profitable to move this shift by a
+  /// constant amount through its operand, adjusting any immediate operands as
+  /// necessary to preserve semantics. This transformation may not be desirable
+  /// if it disrupts a particularly auspicious target-specific tree (e.g.
+  /// bitfield extraction in AArch64). By default, it returns true.
+  ///
+  /// @param MI the shift instruction
+  /// @param IsAfterLegal true if running after legalization.
+  virtual bool isDesirableToCommuteWithShift(const MachineInstr &MI,
+                                             bool IsAfterLegal) const {
+    return true;
+  }
+
   // Return AndOrSETCCFoldKind::{AddAnd, ABS} if its desirable to try and
   // optimize LogicOp(SETCC0, SETCC1). An example (what is implemented as of
   // writing this) is:
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -243,6 +243,14 @@
     [{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>;
 
+// Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+// Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+def commute_shift : GICombineRule<
+  (defs root:$d, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SHL):$d,
+    [{ return Helper.matchCommuteShift(*${d}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${d}, ${matchinfo}); }])>;
+
 def narrow_binop_feeding_and : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
   (match (wip_match_opcode G_AND):$root,
@@ -1097,7 +1105,7 @@
     unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shift,
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
-    div_rem_to_divrem, funnel_shift_combines,
+    div_rem_to_divrem, funnel_shift_combines, commute_shift,
     form_bitfield_extract, constant_fold, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1624,6 +1624,41 @@
   MI.eraseFromParent();
 }
 
+bool CombinerHelper::matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected G_SHL");
+  // Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+  auto &Shl = cast<GenericMachineInstr>(MI);
+  Register DstReg = Shl.getReg(0);
+  Register SrcReg = Shl.getReg(1);
+  Register ShiftReg = Shl.getReg(2);
+  Register X, C1;
+
+  if (!mi_match(SrcReg, MRI,
+                m_OneNonDBGUse(m_any_of(m_GAdd(m_Reg(X), m_Reg(C1)),
+                                        m_GOr(m_Reg(X), m_Reg(C1))))))
+    return false;
+
+  APInt C1Val, C2Val;
+  if (!mi_match(C1, MRI, m_ICstOrSplat(C1Val)) ||
+      !mi_match(ShiftReg, MRI, m_ICstOrSplat(C2Val)))
+    return false;
+
+  if (!getTargetLowering().isDesirableToCommuteWithShift(MI, !isPreLegalize()))
+    return false;
+
+  auto *SrcDef = MRI.getVRegDef(SrcReg);
+  assert((SrcDef->getOpcode() == TargetOpcode::G_ADD ||
+          SrcDef->getOpcode() == TargetOpcode::G_OR) && "Unexpected op");
+  LLT SrcTy = MRI.getType(SrcReg);
+  MatchInfo = [=](MachineIRBuilder &B) {
+    auto S1 = B.buildShl(SrcTy, X, ShiftReg);
+    auto S2 = B.buildShl(SrcTy, C1, ShiftReg);
+    B.buildInstr(SrcDef->getOpcode(), {DstReg}, {S1, S2});
+  };
+  return true;
+}
+
 bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
                                           unsigned &ShiftVal) {
   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-commute-shift.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple aarch64 -mattr=+fullfp16 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name:            shl_add_k
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_add_k
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[C1]]
+    ; CHECK-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_ADD %1, %2
+    %5:_(s32) = G_SHL %3, %4(s32)
+    G_STORE %5(s32), %0(p0) :: (store (s32))
+    RET_ReallyLR
+
+...
+---
+name:            shl_or_k
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_or_k
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[C1]]
+    ; CHECK-NEXT: G_STORE [[OR]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_OR %1, %2
+    %5:_(s32) = G_SHL %3, %4(s32)
+    G_STORE %5(s32), %0(p0) :: (store (s32))
+    RET_ReallyLR
+
+...
+---
+name:            shl_or_k_multiuse
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_or_k_multiuse
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C1]](s32)
+    ; CHECK-NEXT: G_STORE [[SHL]](s32), [[COPY]](p0) :: (store (s32))
+    ; CHECK-NEXT: G_STORE [[OR]](s32), %ptr(p0) :: (store (s32))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %ptr:_(p0) = COPY $x1
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = G_CONSTANT i32 1
+    %4:_(s32) = G_CONSTANT i32 2
+    %3:_(s32) = G_OR %1, %2
+    %5:_(s32) = G_SHL %3, %4(s32)
+    G_STORE %5(s32), %0(p0) :: (store (s32))
+    G_STORE %3(s32), %ptr(p0) :: (store (s32))
+    RET_ReallyLR
+
+...
+---
+name:            shl_add_k_vector
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $w1, $x0
+
+    ; CHECK-LABEL: name: shl_add_k_vector
+    ; CHECK: liveins: $w1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %xvec:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY1]](s32), [[COPY1]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: %veccst2:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<4 x s32>) = G_SHL %xvec, %veccst2(<4 x s32>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[SHL]], [[BUILD_VECTOR]]
+    ; CHECK-NEXT: G_STORE [[ADD]](<4 x s32>), [[COPY]](p0) :: (store (<4 x s32>))
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %xvec:_(<4 x s32>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %2:_(s32) = G_CONSTANT i32 1
+    %veccst:_(<4 x s32>) = G_BUILD_VECTOR %2, %2, %2, %2
+    %4:_(s32) = G_CONSTANT i32 2
+    %veccst2:_(<4 x s32>) = G_BUILD_VECTOR %4, %4, %4, %4
+    %3:_(<4 x s32>) = G_ADD %xvec, %veccst2
+    %5:_(<4 x s32>) = G_SHL %3, %veccst2
+    G_STORE %5(<4 x s32>), %0(p0) :: (store (<4 x s32>))
+    RET_ReallyLR
+
+...
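
As the new TargetLowering hook's comment notes, a target can veto this commute when the untouched (shl (add/or x, c1), c2) tree feeds a pattern it matches directly (e.g. bitfield extraction). The snippet below is a hypothetical sketch of such an override, not part of this patch and not the actual AArch64 implementation; `MyTargetLowering` and its policy are illustrative assumptions.

```cpp
// Hypothetical example only: "MyTargetLowering" does not exist in this patch.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace {
class MyTargetLowering : public llvm::TargetLowering {
public:
  using llvm::TargetLowering::TargetLowering;

  // Matches the overload added above in TargetLowering.h.
  bool isDesirableToCommuteWithShift(const llvm::MachineInstr &MI,
                                     bool IsAfterLegal) const override {
    // Illustrative policy (an assumption): allow the commute before
    // legalization, but keep the original tree afterwards so that late,
    // target-specific patterns can still match it.
    return !IsAfterLegal;
  }
};
} // namespace
```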
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
@@ -101,19 +101,19 @@
 define amdgpu_ps float @add_shl_vgpr_const_inline_const(i32 %a) {
 ; VI-LABEL: add_shl_vgpr_const_inline_const:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x3f4, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x7e800, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: add_shl_vgpr_const_inline_const:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3f4
-; GFX9-NEXT:    v_add_lshl_u32 v0, v0, v1, 9
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e800
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 9, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: add_shl_vgpr_const_inline_const:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_lshl_u32 v0, 0x3f4, v0, 9
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 9, 0x7e800
 ; GFX10-NEXT:    ; return to shader part epilog
   %x = add i32 %a, 1012
   %result = shl i32 %x, 9
@@ -124,18 +124,19 @@
 define amdgpu_ps float @add_shl_vgpr_inline_const_x2(i32 %a) {
 ; VI-LABEL: add_shl_vgpr_inline_const_x2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x600, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: add_shl_vgpr_inline_const_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_add_lshl_u32 v0, v0, 3, 9
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x600
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 9, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: add_shl_vgpr_inline_const_x2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_lshl_u32 v0, v0, 3, 9
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 9, 0x600
 ; GFX10-NEXT:    ; return to shader part epilog
   %x = add i32 %a, 3
   %result = shl i32 %x, 9
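
The folded immediates in the updated AMDGPU checks follow directly from the rewrite: 1012 << 9 = 0x7e800 and 3 << 9 = 0x600. As a stand-alone sanity check (not part of the patch), the following C++ verifies the underlying identity that a left shift distributes over both the wrapping add and the or:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t C2 = 9;               // shift amount used by the tests above
  const uint32_t AddK[] = {1012u, 3u};  // add/or constants from the tests
  for (uint32_t C1 : AddK) {
    for (uint64_t I = 0; I <= 0xFFFFFFFFull; I += 0x10001ull) {
      uint32_t X = static_cast<uint32_t>(I);
      // (x + c1) << c2 == (x << c2) + (c1 << c2)   (both sides mod 2^32)
      assert(((X + C1) << C2) == ((X << C2) + (C1 << C2)));
      // (x | c1) << c2 == (x << c2) | (c1 << c2)
      assert(((X | C1) << C2) == ((X << C2) | (C1 << C2)));
    }
  }
  // Prints 0x7e800 and 0x600, the constants in the new check lines.
  std::printf("1012 << 9 = 0x%x, 3 << 9 = 0x%x\n", 1012u << 9, 3u << 9);
  return 0;
}
```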