Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2246,6 +2246,14 @@
                       bool doesNotReturn = false,
                       bool isReturnValueUsed = true) const;
 
+  struct TargetLoweringOpt;
+
+  // Type of callback function that does target-specific constant optimization.
+  typedef std::function<bool(SDValue, const APInt &, TargetLoweringOpt &)>
+      ConstOptFtorT;
+
+  virtual ConstOptFtorT getConstOptFtor() const { return ConstOptFtorT(); }
+
   /// Check whether parameters to a call that are passed in callee saved
   /// registers are the same as from the calling function. This needs to be
   /// checked for tail call eligibility.
@@ -2267,10 +2275,11 @@
     bool LegalOps;
     SDValue Old;
     SDValue New;
+    ConstOptFtorT ConstOptFtor;
 
-    explicit TargetLoweringOpt(SelectionDAG &InDAG,
-                               bool LT, bool LO) :
-      DAG(InDAG), LegalTys(LT), LegalOps(LO) {}
+    explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO,
+                               ConstOptFtorT Ftor = ConstOptFtorT()) :
+      DAG(InDAG), LegalTys(LT), LegalOps(LO), ConstOptFtor(Ftor) {}
 
     bool LegalTypes() const { return LegalTys; }
     bool LegalOperations() const { return LegalOps; }
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -925,7 +925,8 @@
 /// Check the specified integer node value to see if it can be simplified or if
 /// things it uses can be simplified by bit propagation. If so, return true.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
-  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations,
+                                        TLI.getConstOptFtor());
   APInt KnownZero, KnownOne;
   if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
     return false;
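A note on the hook shape used above: the base TargetLowering returns an empty ConstOptFtorT, and the functor is only invoked when non-null, so targets that don't override getConstOptFtor() pay nothing. A minimal compilable sketch of this null-checkable std::function hook pattern (all names here are illustrative stand-ins, not part of the patch):

    #include <cstdio>
    #include <functional>

    struct Opt;  // stands in for TargetLoweringOpt

    // Stand-in for the TargetLowering side of the hook.
    struct Lowering {
      using ConstOptFtorT = std::function<bool(int /*Op*/, Opt &)>;
      // Default: empty functor, i.e. "no target-specific optimization".
      virtual ConstOptFtorT getConstOptFtor() const { return ConstOptFtorT(); }
      virtual ~Lowering() = default;
    };

    struct Opt {
      Lowering::ConstOptFtorT ConstOptFtor;  // captured at construction, like TLO
    };

    struct AArch64Lowering : Lowering {
      ConstOptFtorT getConstOptFtor() const override {
        return [](int Op, Opt &) { return Op == 42; };  // pretend optimization
      }
    };

    int main() {
      AArch64Lowering TLI;
      Opt TLO{TLI.getConstOptFtor()};
      // The combiner-side check: only call the functor if it is non-null.
      if (TLO.ConstOptFtor && TLO.ConstOptFtor(42, TLO))
        std::printf("target hook fired\n");
    }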
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -341,6 +341,21 @@
                                           const APInt &Demanded) {
   SDLoc dl(Op);
 
+  if (Op.getOpcode() == ISD::XOR) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+    // If an XOR already has all the demanded bits set, there is nothing to
+    // change, but don't shrink the constant either.
+    if (!C || (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+  }
+
+  assert(!Old.getNode() && !New.getNode());
+
+  // Call ConstOptFtor to do target-specific constant optimization.
+  if (ConstOptFtor && ConstOptFtor(Op, Demanded, *this))
+    return New.getNode();
+
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
   default: break;
@@ -350,10 +365,6 @@
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     if (!C) return false;
 
-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
-      return false;
-
     // if we can expand it to have all bits set, do it
     if (C->getAPIntValue().intersects(~Demanded)) {
       EVT VT = Op.getValueType();
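The XOR guard hoisted to the top of ShrinkDemandedConstant fires when every demanded bit of the constant is already set; in that case the XOR acts as a NOT on the bits that matter, and instruction selection folds that into BIC/ORN rather than having the constant rewritten. A small standalone check of the predicate, using plain uint64_t in place of APInt (a sketch, not the patch's code):

    #include <cassert>
    #include <cstdint>

    // True iff every bit demanded by `demanded` is set in `c`:
    // this mirrors (C->getAPIntValue() | (~Demanded)).isAllOnesValue().
    static bool allDemandedBitsSet(uint64_t c, uint64_t demanded) {
      return (c | ~demanded) == ~0ULL;
    }

    int main() {
      // XOR with 56 when only bits 3..5 are demanded: a pure NOT, leave it alone.
      assert(allDemandedBitsSet(56, 56));
      // XOR with 0x38 when bit 0 is also demanded: bit 0 of the constant is 0,
      // so the constant is a legitimate candidate for shrinking/optimization.
      assert(!allDemandedBitsSet(0x38, 0x39));
      return 0;
    }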
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -239,6 +239,8 @@
                                      APInt &KnownOne, const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
+  ConstOptFtorT getConstOptFtor() const override;
+
   MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
 
   /// Returns true if the target allows unaligned memory accesses of the
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -39,6 +39,7 @@
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
 static cl::opt<bool>
 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
@@ -53,6 +54,12 @@
     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
     cl::init(false));
 
+static cl::opt<bool>
+EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+                         cl::desc("Enable AArch64 logical imm instruction "
+                                  "optimization"),
+                         cl::init(true));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -731,6 +738,141 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+                               const APInt &Demanded,
+                               TargetLowering::TargetLoweringOpt &TLO,
+                               unsigned NewOpc) {
+  if (!EnableOptimizeLogicalImm)
+    return false;
+
+  uint64_t OldImm = Imm, NewImm, Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size));
+
+  // Return if the immediate is already a bimm32 or bimm64.
+  if (AArch64_AM::processLogicalImmediate(Imm & Mask, Size, Enc))
+    return false;
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of transitions between 0 and 1. In order to achieve this
+    // goal, we set the non-demanded bits to the value of the preceding
+    // demanded bits. For example, if we have an immediate 0bx10xx0x1 ('x'
+    // indicates a non-demanded bit), we copy bit0 (1) to the least significant
+    // 'x', bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. The
+    // final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t InvertedImm = ~Imm & DemandedBits;
+    uint64_t RotatedImm =
+        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+        NonDemandedBits;
+    uint64_t Sum = RotatedImm + NonDemandedBits;
+    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
+    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
+    NewImm = (Imm | Ones) & Mask;
+
+    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask
+    // immediate or all-ones or all-zeros, in which case we can stop searching.
+    // Otherwise, we halve the element size and continue the search.
+    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+      break;
+
+    // We cannot shrink the element size any further if it is 2 bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  ++NumOptimizedImms;
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    NewImm |= NewImm << EltSize;
+    EltSize *= 2;
+  }
+
+  (void)OldImm;
+  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+         "demanded bits should never be altered");
+
+  // Create the new constant immediate node.
+  EVT VT = Op.getValueType();
+  unsigned Population = countPopulation(NewImm);
+  SDLoc DL(Op);
+
+  // If the new constant immediate is all-zeros or all-ones, let the target
+  // independent DAG combine optimize this node.
+  if (Population == 0 || Population == Size)
+    return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT));
+
+  // Otherwise, create a machine node so that target independent DAG combine
+  // doesn't undo this optimization.
+  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+  SDValue New(
+      TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+
+  return TLO.CombineTo(Op, New);
+}
+
+static bool aarch64ConstOptFunc(SDValue Op, const APInt &Demanded,
+                                TargetLowering::TargetLoweringOpt &TLO) {
+  // Delay this optimization to as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return false;
+
+  // FIXME: Check if Op has operand 1?
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+  assert((Size == 32 || Size == 64) &&
+         "i32 or i64 is expected after legalization.");
+  uint64_t Imm = C->getZExtValue();
+  unsigned NewOpc;
+
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+    break;
+  case ISD::OR:
+    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+    break;
+  case ISD::XOR:
+    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+    break;
+  default:
+    return false;
+  }
+
+  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
+TargetLowering::ConstOptFtorT AArch64TargetLowering::getConstOptFtor() const {
+  return aarch64ConstOptFunc;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
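To make the filling step in optimizeLogicalImm concrete, here is the comment's own example (0bx10xx0x1) run through one iteration of the loop, specialized to an 8-bit element. This is an illustrative toy, not patch code; the extra & Mask operations only simulate element-sized arithmetic inside a 64-bit variable:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned EltSize = 8;
      const uint64_t Mask = 0xFF;
      // Immediate 0bx10xx0x1: demanded bits are 6, 5, 2, 0 ('x' = non-demanded).
      uint64_t DemandedBits = 0x65; // 0b01100101
      uint64_t Imm = 0x41;          // 0b01000001, non-demanded bits cleared

      uint64_t NonDemandedBits = ~DemandedBits & Mask;
      uint64_t InvertedImm = ~Imm & DemandedBits;
      // Rotate the demanded zeros left by one so each lines up with the
      // non-demanded bit directly above it.
      uint64_t RotatedImm =
          ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
          NonDemandedBits;
      // Adding NonDemandedBits propagates each demanded bit's value upward
      // through the run of non-demanded bits that follows it.
      uint64_t Sum = (RotatedImm + NonDemandedBits) & Mask;
      bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
      uint64_t Ones = (Sum + Carry) & NonDemandedBits;
      uint64_t NewImm = (Imm | Ones) & Mask;

      assert(NewImm == 0xC3 && "expected 0b11000011, as in the source comment");
      std::printf("NewImm = 0x%llx\n", (unsigned long long)NewImm);
      return 0;
    }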
Index: test/CodeGen/AArch64/bitreverse.ll
===================================================================
--- test/CodeGen/AArch64/bitreverse.ll
+++ test/CodeGen/AArch64/bitreverse.ll
@@ -28,10 +28,8 @@
 ; CHECK-DAG: lsr [[S2:w.*]], [[H2]], #2
 ; CHECK-DAG: orr [[R2:w.*]], [[S2]], [[L2]], lsl #2
 
-; CHECK-DAG: mov [[P1:w.*]], #1426063360
-; CHECK-DAG: mov [[N1:w.*]], #-1442840576
-; CHECK-DAG: and [[L1:w.*]], [[R2]], [[P1]]
-; CHECK-DAG: and [[H1:w.*]], [[R2]], [[N1]]
+; CHECK-DAG: and [[L1:w.*]], [[R2]], #0x55555555
+; CHECK-DAG: and [[H1:w.*]], [[R2]], #0xaaaaaaaa
 
 ; CHECK-DAG: lsr [[S1:w.*]], [[H1]], #1
 ; CHECK-DAG: orr [[R1:w.*]], [[S1]], [[L1]], lsl #1
Index: test/CodeGen/AArch64/optimize-imm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; CHECK-LABEL: and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc0600) | 0xfff1f1ff
+;
+; CHECK-LABEL: and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if the
+; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into
+; a BIC.
+
+; CHECK-LABEL: xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}
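As a cross-check of the and2 expectation above: the OR with -16384 forces bits 14..31 of the result, so only the low 14 bits of the AND mask are demanded, and rewriting 0x3dfd to 0xfdfdfdfd (the 8-bit element 0xfd replicated across the register, exactly the form an AArch64 logical immediate can encode) preserves every demanded bit. A standalone verification of both properties (illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t OldImm = 15869;      // 0x3dfd, from @and2
      const uint32_t OrMask = 0xffffc000; // the following `or` with -16384
      const uint32_t Demanded = ~OrMask;  // 0x3fff: bits the OR doesn't force
      const uint32_t NewImm = 0xfdfdfdfd; // immediate expected by FileCheck

      // Demanded bits must be unchanged by the rewrite.
      assert(((OldImm ^ NewImm) & Demanded) == 0);

      // The new value is an 8-bit element (0xfd) replicated across 32 bits.
      uint32_t Rep = 0xfd;
      Rep |= Rep << 8;
      Rep |= Rep << 16;
      assert(Rep == NewImm);
      return 0;
    }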