Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2045,6 +2045,14 @@
                     SDLoc dl, bool doesNotReturn = false,
                     bool isReturnValueUsed = true) const;

+  struct TargetLoweringOpt;
+
+  // Type of call-back function that does target-specific constant optimization.
+  typedef std::function<bool(SDValue, const APInt &, TargetLoweringOpt &)>
+      ConstOptFtorT;
+
+  virtual ConstOptFtorT getConstOptFtor() const { return ConstOptFtorT(); }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
   //
@@ -2058,10 +2066,11 @@
     bool LegalOps;
     SDValue Old;
     SDValue New;
+    ConstOptFtorT ConstOptFtor;

-    explicit TargetLoweringOpt(SelectionDAG &InDAG,
-                               bool LT, bool LO) :
-      DAG(InDAG), LegalTys(LT), LegalOps(LO) {}
+    explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO,
+                               ConstOptFtorT Ftor = ConstOptFtorT()) :
+      DAG(InDAG), LegalTys(LT), LegalOps(LO), ConstOptFtor(Ftor) {}

     bool LegalTypes() const { return LegalTys; }
     bool LegalOperations() const { return LegalOps; }
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -870,7 +870,8 @@
 /// Check the specified integer node value to see if it can be simplified or if
 /// things it uses can be simplified by bit propagation. If so, return true.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
-  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations,
+                                        TLI.getConstOptFtor());
   APInt KnownZero, KnownOne;
   if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
     return false;
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -286,6 +286,21 @@
                                           const APInt &Demanded) {
   SDLoc dl(Op);

+  if (Op.getOpcode() == ISD::XOR) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+    // If an XOR already has all the demanded bits set, there is nothing to
+    // change, but don't shrink it either.
+    if (!C || (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+  }
+
+  assert(!Old.getNode() && !New.getNode());
+
+  // Call ConstOptFtor and do target-specific constant optimization.
+  if (ConstOptFtor && ConstOptFtor(Op, Demanded, *this))
+    return New.getNode();
+
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
   default: break;
@@ -295,10 +310,6 @@
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     if (!C) return false;

-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
-      return false;
-
     // if we can expand it to have all bits set, do it
     if (C->getAPIntValue().intersects(~Demanded)) {
       EVT VT = Op.getValueType();
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -233,6 +233,8 @@
                                      APInt &KnownOne, const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;

+  ConstOptFtorT getConstOptFtor() const override;
+
   MVT getScalarShiftAmountTy(EVT LHSTy) const override;

   /// allowsMisalignedMemoryAccesses - Returns true if the target allows
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -38,6 +38,7 @@

 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

 namespace {
 enum AlignMode {
@@ -710,6 +711,143 @@
   return VT.changeVectorElementTypeToInteger();
 }

+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+                               const APInt &Demanded,
+                               TargetLowering::TargetLoweringOpt &TLO,
+                               unsigned NewOpc) {
+  uint64_t OldImm = Imm, NewImm, Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size));
+
+  // Return if the immediate is already a bimm32 or bimm64. Reporting it as
+  // handled keeps the generic code from shrinking an already encodable
+  // immediate.
+  if (AArch64_AM::processLogicalImmediate(Imm & Mask, Size, Enc))
+    return true;
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of switches between 0 and 1. In order to achieve this goal,
+    // we set the non-demanded bits to the value of the preceding demanded bits.
+    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
+    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
+    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+    // The final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t InvertedImm = ~Imm & DemandedBits;
+    uint64_t RotatedImm =
+        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+        NonDemandedBits;
+    uint64_t Sum = RotatedImm + NonDemandedBits;
+
+    // There is a carry if the MSBs of NonDemandedBits and Sum are 1 and 0,
+    // respectively.
+    uint64_t Carry = ((NonDemandedBits & ~Sum) >> (EltSize - 1)) & 1;
+    uint64_t NonDemandedBitPattern = (Sum + Carry) & NonDemandedBits;
+
+    // Compute the new immediate.
+    NewImm = (Imm | NonDemandedBitPattern) & Mask;
+
+    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+    // we halve the element size and continue the search.
+    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+      break;
+
+    // Don't shrink the element size below 2 bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  ++NumOptimizedImms;
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    NewImm |= NewImm << EltSize;
+    EltSize *= 2;
+  }
+
+  (void)OldImm;
+  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+         "demanded bits should never be altered");
+
+  // Create the new constant immediate node.
+  EVT VT = Op.getValueType();
+  unsigned Population = countPopulation(NewImm);
+  SDLoc DL(Op);
+
+  // If NewImm is all zeros or all ones, replace the immediate with NewImm and
+  // let the target-independent DAGCombine optimize it further.
+  if (Population == 0 || Population == Size)
+    return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT));
+
+  // Otherwise, emit an AArch64 machine node to prevent the target-independent
+  // DAGCombine from undoing the optimization.
+  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+  SDValue New(
+      TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+
+  return TLO.CombineTo(Op, New);
+}
+
+static bool aarch64ConstOptFunc(SDValue Op, const APInt &Demanded,
+                                TargetLowering::TargetLoweringOpt &TLO) {
+  // Delay this optimization to as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return false;
+
+  assert(Op.getNumOperands() > 1 && "More than one operand expected.");
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+  assert((Size == 32 || Size == 64) &&
+         "i32 or i64 is expected after legalization.");
+  uint64_t Imm = C->getZExtValue();
+  unsigned NewOpc;
+
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+    break;
+  case ISD::OR:
+    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+    break;
+  case ISD::XOR:
+    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+    break;
+  default:
+    return false;
+  }
+
+  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
+TargetLowering::ConstOptFtorT AArch64TargetLowering::getConstOptFtor() const {
+  return aarch64ConstOptFunc;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
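The following is a standalone sketch of the non-demanded-bit filling trick used in the loop of optimizeLogicalImm above; it is not part of the patch, and the helper name fillNonDemandedBits, the 8-bit element size, and the driver in main are illustrative assumptions. It reproduces the 0bx10xx0x1 -> 0b11000011 example from the comment.

// Standalone sketch: fill each non-demanded bit with the value of the
// preceding demanded bit, using the same rotate-and-add trick as
// optimizeLogicalImm. Assumes 0 < EltSize < 64 for this demo.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                    unsigned EltSize) {
  // Work entirely within an EltSize-bit element.
  uint64_t Mask = (~0ULL) >> (64 - EltSize);
  uint64_t NonDemandedBits = ~DemandedBits & Mask;
  Imm &= DemandedBits & Mask;

  // Adding NonDemandedBits to the rotated, inverted immediate propagates each
  // demanded bit upward into the run of non-demanded bits that follows it.
  uint64_t InvertedImm = ~Imm & DemandedBits & Mask;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | ((InvertedImm >> (EltSize - 1)) & 1)) &
      NonDemandedBits;
  uint64_t Sum = RotatedImm + NonDemandedBits;
  uint64_t Carry = ((NonDemandedBits & ~Sum) >> (EltSize - 1)) & 1;
  uint64_t Pattern = (Sum + Carry) & NonDemandedBits;

  return (Imm | Pattern) & Mask;
}

int main() {
  // Immediate 0bx10xx0x1: bits 6, 5, 2 and 0 are demanded.
  uint64_t Demanded = 0x65; // 0b01100101
  uint64_t Imm = 0x41;      // 0b01000001, demanded bits only
  uint64_t NewImm = fillNonDemandedBits(Imm, Demanded, 8);
  assert(NewImm == 0xC3 && "expected 0b11000011, as in the comment");
  printf("0x%llx\n", (unsigned long long)NewImm);
  return 0;
}

Unlike the patch, which computes NonDemandedBits over the full 64-bit value and masks only at the end, this sketch masks everything to the element width up front.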
Index: test/CodeGen/AArch64/optimize-imm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -march=aarch64 | FileCheck %s
+
+; CHECK-LABEL: _and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: _and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: _and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc5600) | 0xfff1f1ff
+;
+; CHECK-LABEL: _and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if the
+; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into
+; a BIC.
+
+; CHECK-LABEL: _xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}
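To make the masks in the test above easier to check by hand, here is an independent, self-contained re-implementation sketch of the bimm32/bimm64 validity rule that AArch64_AM::processLogicalImmediate enforces in the patch: a rotation of a contiguous run of ones, replicated across the register in power-of-two-sized elements, with all-zeros and all-ones excluded. The names isLogicalImm and rotateRight are local to this sketch, not LLVM APIs.

// Standalone sketch of the bimm32/bimm64 rule, for hand-checking the test's
// expected masks; not the LLVM AArch64_AM implementation.
#include <cassert>
#include <cstdint>

static uint64_t rotateRight(uint64_t V, unsigned R, unsigned Size) {
  uint64_t Mask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  V &= Mask;
  if (R == 0)
    return V;
  return ((V >> R) | (V << (Size - R))) & Mask;
}

static bool isLogicalImm(uint64_t Imm, unsigned Size) {
  uint64_t Mask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  Imm &= Mask;
  if (Imm == 0 || Imm == Mask)
    return false;
  for (unsigned EltSize = 2; EltSize <= Size; EltSize *= 2) {
    uint64_t EltMask = (EltSize == 64) ? ~0ULL : ((1ULL << EltSize) - 1);
    uint64_t Elt = Imm & EltMask;
    // The element must replicate across the whole register.
    bool Replicates = true;
    for (unsigned I = EltSize; I < Size; I += EltSize)
      if (((Imm >> I) & EltMask) != Elt)
        Replicates = false;
    if (!Replicates)
      continue;
    // Some rotation of the element must be a contiguous run of ones anchored
    // at bit 0, i.e. Rot + 1 must be a power of two.
    for (unsigned R = 0; R < EltSize; ++R) {
      uint64_t Rot = rotateRight(Elt, R, EltSize);
      if (Rot != 0 && Rot != EltMask && ((Rot + 1) & Rot) == 0)
        return true;
    }
  }
  return false;
}

int main() {
  // The masks the CHECK lines above expect are all encodable...
  assert(isLogicalImm(0xfffffffdULL, 32));
  assert(isLogicalImm(0xfdfdfdfdULL, 32));
  assert(isLogicalImm(0x99999999ULL, 32));
  assert(isLogicalImm(0xfffc07ffULL, 32));
  // ...while the original AND constants in and2 and and4 are not.
  assert(!isLogicalImm(15869, 32));  // and2: 0x3dfd
  assert(!isLogicalImm(787968, 32)); // and4: 0xc0600
  return 0;
}

Constants that fail this check typically have to be materialized into a register (for example with MOVZ/MOVK) before the logical operation, which is the extra instruction the patch avoids.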