diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -603,6 +603,10 @@
   bool matchUDivByConst(MachineInstr &MI);
   void applyUDivByConst(MachineInstr &MI);
 
+  // G_UMULH x, (1 << c) -> x >> (bitwidth - c), where c != 0
+  bool matchUMulHToLShr(MachineInstr &MI);
+  void applyUMulHToLShr(MachineInstr &MI);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -272,6 +272,11 @@
                                          Register Src,
                                          const MachineRegisterInfo &MRI);
 
+/// Tries to constant fold a G_CTLZ operation on \p Src. If \p Src is a vector
+/// then it tries to do an element-wise constant fold.
+Optional<SmallVector<unsigned>>
+ConstantFoldCTLZ(Register Src, const MachineRegisterInfo &MRI);
+
 /// Test if the given value is known to have exactly one bit set. This differs
 /// from computeKnownBits in that it doesn't necessarily determine which bit is
 /// set.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -717,6 +717,14 @@
          [{ return Helper.matchMulOBy2(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
 
+def mulh_to_lshr : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UMULH):$root,
+         [{ return Helper.matchUMulHToLShr(*${root}); }]),
+  (apply [{ Helper.applyUMulHToLShr(*${root}); }])>;
+
+def mulh_combines : GICombineGroup<[mulh_to_lshr]>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -763,7 +771,7 @@
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
     truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
-    form_bitfield_extract, constant_fold, intdiv_combines]>;
+    form_bitfield_extract, constant_fold, intdiv_combines, mulh_combines]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 
 using namespace llvm;
@@ -213,6 +214,22 @@
       return buildFConstant(DstOps[0], *Cst);
     break;
   }
+  case TargetOpcode::G_CTLZ: {
+    assert(SrcOps.size() == 1 && "Expected one source");
+    assert(DstOps.size() == 1 && "Expected one dest");
+    auto MaybeCsts = ConstantFoldCTLZ(SrcOps[0].getReg(), *getMRI());
+    if (!MaybeCsts)
+      break;
+    if (MaybeCsts->size() == 1)
+      return buildConstant(DstOps[0], (*MaybeCsts)[0]);
+    // This was a vector constant. Build a G_BUILD_VECTOR for them.
+    SmallVector<Register> ConstantRegs;
+    LLT VecTy = DstOps[0].getLLTTy(*getMRI());
+    for (unsigned Cst : *MaybeCsts)
+      ConstantRegs.emplace_back(
+          buildConstant(VecTy.getScalarType(), Cst).getReg(0));
+    return buildBuildVector(DstOps[0], ConstantRegs);
+  }
   }
   bool CanCopy = checkCopyToDefsPossible(DstOps);
   if (!canPerformCSEForOpc(Opc))
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -69,6 +69,16 @@
   return I;
 }
 
+/// Determines the LogBase2 value for a non-null input value using the
+/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
+static Register buildLogBase2(Register V, MachineIRBuilder &MIB) {
+  auto &MRI = *MIB.getMRI();
+  LLT Ty = MRI.getType(V);
+  auto Ctlz = MIB.buildCTLZ(Ty, V);
+  auto Base = MIB.buildConstant(Ty, Ty.getScalarSizeInBits() - 1);
+  return MIB.buildSub(Ty, Base, Ctlz).getReg(0);
+}
+
 /// \returns The big endian in-memory byte position of byte \p I in a
 /// \p ByteWidth bytes wide type.
 ///
@@ -4557,6 +4567,45 @@
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 
+/// Match G_UMULH x, (1 << c) where c != 0; such a multiply-high is equivalent
+/// to x >> (bitwidth - c).
+bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_UMULH);
+  Register RHS = MI.getOperand(2).getReg();
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+  // Reject 1 (c == 0): the rewrite would shift by the full bit width, an
+  // out-of-range amount, whereas G_UMULH x, 1 is simply 0.
+  auto MatchPow2ExceptOne = [](const Constant *C) {
+    if (auto *CI = dyn_cast_or_null<ConstantInt>(C))
+      return CI->getValue().isPowerOf2() && !CI->isOne();
+    return false;
+  };
+  if (!matchUnaryPredicate(MRI, RHS, MatchPow2ExceptOne, false))
+    return false;
+  return isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR, {Ty, ShiftAmtTy}});
+}
+
+/// Replace the G_UMULH accepted by matchUMulHToLShr with a G_LSHR of the LHS
+/// by (bitwidth - log2(RHS)), then erase the original instruction.
+void CombinerHelper::applyUMulHToLShr(MachineInstr &MI) {
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+  unsigned NumEltBits = Ty.getScalarSizeInBits();
+
+  Builder.setInstrAndDebugLoc(MI);
+  auto LogBase2 = buildLogBase2(RHS, Builder);
+  auto ShiftAmt =
+      Builder.buildSub(Ty, Builder.buildConstant(Ty, NumEltBits), LogBase2);
+  auto Trunc = Builder.buildZExtOrTrunc(ShiftAmtTy, ShiftAmt);
+  Builder.buildLShr(Dst, LHS, Trunc);
+  MI.eraseFromParent();
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -726,6 +726,37 @@
   return None;
 }
 
+Optional<SmallVector<unsigned>>
+llvm::ConstantFoldCTLZ(Register Src, const MachineRegisterInfo &MRI) {
+  LLT Ty = MRI.getType(Src);
+  SmallVector<unsigned> FoldedCTLZs;
+  auto tryFoldScalar = [&](Register R) -> Optional<unsigned> {
+    auto MaybeCst = getIConstantVRegVal(R, MRI);
+    if (!MaybeCst)
+      return None;
+    return MaybeCst->countLeadingZeros();
+  };
+  if (Ty.isVector()) {
+    // Try to constant fold each element.
+    auto *BV = getOpcodeDef<GBuildVector>(Src, MRI);
+    if (!BV)
+      return None;
+    for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
+      if (auto MaybeFold = tryFoldScalar(BV->getSourceReg(SrcIdx))) {
+        FoldedCTLZs.emplace_back(*MaybeFold);
+        continue;
+      }
+      return None;
+    }
+    return FoldedCTLZs;
+  }
+  if (auto MaybeCst = tryFoldScalar(Src)) {
+    FoldedCTLZs.emplace_back(*MaybeCst);
+    return FoldedCTLZs;
+  }
+  return None;
+}
+
 bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
                                   GISelKnownBits *KB) {
   Optional<DefinitionAndSourceRegister> DefSrcReg =
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -22,14 +22,16 @@
 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_1]
 ; GISEL-NEXT: adrp x8, .LCPI0_0
 ; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
-; GISEL-NEXT: umull2 v3.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
-; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
-; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h
-; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: adrp x8, .LCPI0_2
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
+; GISEL-NEXT: sub v1.8h, v2.8h, v1.8h
+; GISEL-NEXT: neg v1.8h, v1.8h
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v3.8h
+; GISEL-NEXT: umull v3.4s, v0.4h, v3.4h
+; GISEL-NEXT: uzp2 v2.8h, v3.8h, v2.8h
+; GISEL-NEXT: sub v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: add v0.8h, v0.8h, v2.8h
 ; GISEL-NEXT: ushr v0.8h, v0.8h, #4
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x,
@@ -155,28 +157,30 @@
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform3:
 ; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI3_5
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_5]
 ; GISEL-NEXT: adrp x8, .LCPI3_4
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_4]
-; GISEL-NEXT: adrp x8, .LCPI3_3
-;
GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_4] ; GISEL-NEXT: adrp x8, .LCPI3_2 ; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] ; GISEL-NEXT: adrp x8, .LCPI3_1 ; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_1] +; GISEL-NEXT: adrp x8, .LCPI3_3 +; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI3_3] ; GISEL-NEXT: adrp x8, .LCPI3_0 -; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI3_0] -; GISEL-NEXT: umull2 v6.4s, v0.8h, v2.8h +; GISEL-NEXT: ldr q6, [x8, :lo12:.LCPI3_0] +; GISEL-NEXT: sub v3.8h, v4.8h, v3.8h +; GISEL-NEXT: umull2 v4.4s, v0.8h, v2.8h ; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h -; GISEL-NEXT: uzp2 v2.8h, v2.8h, v6.8h -; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h -; GISEL-NEXT: sub v5.8h, v0.8h, v2.8h -; GISEL-NEXT: umull2 v6.4s, v5.8h, v3.8h -; GISEL-NEXT: umull v3.4s, v5.4h, v3.4h -; GISEL-NEXT: uzp2 v3.8h, v3.8h, v6.8h -; GISEL-NEXT: neg v4.8h, v4.8h +; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h +; GISEL-NEXT: neg v3.8h, v3.8h +; GISEL-NEXT: sub v4.8h, v0.8h, v2.8h +; GISEL-NEXT: cmeq v1.8h, v1.8h, v6.8h +; GISEL-NEXT: ushl v3.8h, v4.8h, v3.8h +; GISEL-NEXT: neg v5.8h, v5.8h ; GISEL-NEXT: shl v1.8h, v1.8h, #15 ; GISEL-NEXT: add v2.8h, v3.8h, v2.8h -; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h +; GISEL-NEXT: ushl v2.8h, v2.8h, v5.8h ; GISEL-NEXT: sshr v1.8h, v1.8h, #15 ; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b ; GISEL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir @@ -34,17 +34,20 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), 
[[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16) ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]] - ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]] - ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]] - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>) - ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 16 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16) + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<8 x s16>) = G_SUB [[BUILD_VECTOR3]], [[BUILD_VECTOR2]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[SUB1]](<8 x s16>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>) + ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 %2:_(s16) = G_CONSTANT i16 23 @@ -208,30 +211,33 @@ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 127 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), 
[[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16) ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363 - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 - ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 - ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351 - ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137 - ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115 - ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705 - ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 - ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041 - ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 517 - ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 6 - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16), [[C15]](s16), [[C16]](s16), [[C18]](s16), [[C19]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C10]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C17]](s16), [[C17]](s16), [[C20]](s16) + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351 + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137 + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115 + ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705 + ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 + ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041 + ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = 
G_CONSTANT i16 517 + ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 6 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C12]](s16), [[C13]](s16), [[C14]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C11]](s16), [[C11]](s16), [[C11]](s16), [[C11]](s16), [[C16]](s16), [[C16]](s16), [[C19]](s16) ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]] - ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]] - ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]] - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>) - ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16) + ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 16 ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR]] + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<8 x s16>) = G_SUB [[BUILD_VECTOR4]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[SUB1]](<8 x s16>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>) + ; CHECK-NEXT: [[C22:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 
+ ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR5]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]] ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-umulh-to-lshr.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +--- +name: mul_to_lshr +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_to_lshr + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 61 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s64) + ; CHECK-NEXT: $x0 = COPY [[LSHR]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 8 + %2:_(s64) = G_UMULH %0, %1(s64) + $x0 = COPY %2(s64) +... 
+--- +name: mul_to_lshr_vector +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; CHECK-LABEL: name: mul_to_lshr_vector + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[BUILD_VECTOR1]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C2]](s32), [[C2]](s32), [[C2]](s32), [[C2]](s32) + ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(<4 x s32>) = G_SUB [[BUILD_VECTOR2]], [[SUB]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[COPY]], [[SUB1]](<4 x s32>) + ; CHECK-NEXT: $q0 = COPY [[LSHR]](<4 x s32>) + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 8 + %bv:_(<4 x s32>) = G_BUILD_VECTOR %1, %1, %1, %1 + %2:_(<4 x s32>) = G_UMULH %0, %bv(<4 x s32>) + $q0 = COPY %2(<4 x s32>) +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -222,8 +222,7 @@ ; CHECK-LABEL: v_udiv_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, 0x100000 -; CHECK-NEXT: v_mul_hi_u32 v0, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 12, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i32 %num, 4096 ret i32 %result @@ -233,9 +232,8 @@ ; CHECK-LABEL: v_udiv_v2i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x100000 -; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4 -; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 12, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i32> %num, ret <2 x i32> %result @@ -257,38 +255,21 @@ } define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) { -; GISEL-LABEL: v_udiv_v2i32_oddk_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xb2a50881 -; GISEL-NEXT: s_brev_b32 s5, 1 -; GISEL-NEXT: v_mul_hi_u32 v2, v0, s4 -; GISEL-NEXT: v_mul_hi_u32 v3, v1, s4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v0, s5 -; GISEL-NEXT: v_mul_hi_u32 v1, v1, s5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 20, v1 -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_udiv_v2i32_oddk_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xb2a50881 -; CGP-NEXT: v_mul_hi_u32 v2, v0, s4 -; CGP-NEXT: v_mul_hi_u32 v3, v1, s4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: 
v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; CGP-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_lshrrev_b32_e32 v0, 20, v0 -; CGP-NEXT: v_lshrrev_b32_e32 v1, 20, v1 -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_udiv_v2i32_oddk_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0xb2a50881 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, s4 +; CHECK-NEXT: v_mul_hi_u32 v3, v1, s4 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i32> %num, ret <2 x i32> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -969,27 +969,7 @@ ; CHECK-LABEL: v_udiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x100000 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v0 -; CHECK-NEXT: v_mul_hi_u32 v3, v0, 0 -; CHECK-NEXT: v_lshlrev_b32_e32 v4, 20, v1 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, 0 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 12 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, 4096 ret i64 %result @@ -999,54 +979,17 @@ ; GISEL-LABEL: v_udiv_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshr_b64 v[0:1], v[0:1], 12 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[2:3], 12 +; GISEL-NEXT: s_sub_u32 s4, 63, 11 +; GISEL-NEXT: s_sub_u32 s4, 64, s4 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[0:1], s4 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[2:3], s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0x100000 -; CGP-NEXT: v_lshlrev_b32_e32 v4, 20, v0 -; CGP-NEXT: v_mul_hi_u32 v5, v0, 0 -; CGP-NEXT: v_lshlrev_b32_e32 v6, 20, v1 -; CGP-NEXT: v_mul_hi_u32 v7, v1, 0 -; CGP-NEXT: v_lshlrev_b32_e32 v8, 20, v2 -; CGP-NEXT: v_mul_hi_u32 v9, v2, 0 -; CGP-NEXT: v_lshlrev_b32_e32 v10, 20, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v3, 0 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v0, v0, s4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v1, v1, s4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v2, v2, s4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v3, v3, s4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v9 -; 
CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v8
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
+; CGP-NEXT: v_lshr_b64 v[2:3], v[2:3], 12
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 %result = udiv <2 x i64> %num,
 ret <2 x i64> %result
diff --git a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp
--- a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp
@@ -7,7 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "GISelMITest.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "gtest/gtest.h"
 
 namespace {
@@ -163,4 +166,46 @@
   EXPECT_TRUE(CSEB.getInsertPt() == CSEB.getMBB().end());
 }
 
+TEST_F(AArch64GISelMITest, TestConstantFoldCTL) {
+  setUp();
+  if (!TM)
+    return;
+
+  LLT s32 = LLT::scalar(32);
+
+  GISelCSEInfo CSEInfo;
+  CSEInfo.setCSEConfig(std::make_unique<CSEConfigFull>());
+  CSEInfo.analyze(*MF);
+  B.setCSEInfo(&CSEInfo);
+  CSEMIRBuilder CSEB(B.getState());
+  auto Cst8 = CSEB.buildConstant(s32, 8);
+  auto *CtlzDef = &*CSEB.buildCTLZ(s32, Cst8);
+  EXPECT_TRUE(CtlzDef->getOpcode() == TargetOpcode::G_CONSTANT);
+  EXPECT_EQ(CtlzDef->getOperand(1).getCImm()->getZExtValue(), 28u);
+
+  // Test vector.
+  auto Cst16 = CSEB.buildConstant(s32, 16);
+  auto Cst32 = CSEB.buildConstant(s32, 32);
+  auto Cst64 = CSEB.buildConstant(s32, 64);
+  LLT VecTy = LLT::fixed_vector(4, s32);
+  auto BV = CSEB.buildBuildVector(VecTy, {Cst8.getReg(0), Cst16.getReg(0),
+                                          Cst32.getReg(0), Cst64.getReg(0)});
+  CSEB.buildCTLZ(VecTy, BV);
+
+  auto CheckStr = R"(
+  ; CHECK: [[CST8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+  ; CHECK: [[CST28:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
+  ; CHECK: [[CST16:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; CHECK: [[CST32:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+  ; CHECK: [[CST64:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+  ; CHECK: [[BV1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[CST8]]:_(s32), [[CST16]]:_(s32), [[CST32]]:_(s32), [[CST64]]:_(s32)
+  ; CHECK: [[CST27:%[0-9]+]]:_(s32) = G_CONSTANT i32 27
+  ; CHECK: [[CST26:%[0-9]+]]:_(s32) = G_CONSTANT i32 26
+  ; CHECK: [[CST25:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+  ; CHECK: [[BV2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[CST28]]:_(s32), [[CST27]]:_(s32), [[CST26]]:_(s32), [[CST25]]:_(s32)
+  )";
+
+  EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF;
+}
+
 } // namespace