Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2071,6 +2071,15 @@
                             APInt &KnownZero, APInt &KnownOne,
                             TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
+  /// Do a target-specific optimization of Op's constant immediate operand using
+  /// the demanded bits information. For example, targets can sign-extend the
+  /// immediate in order to have it folded into the immediate field of the
+  /// instruction. Return true if the immediate node doesn't need further
+  /// optimization and set the New and Old fields of TLO if a new immediate
+  /// node was created.
+  virtual bool optimizeConstant(SDValue Op, const APInt &Demanded,
+                                TargetLoweringOpt &TLO) const;
+
   /// Determine which of the bits specified in Mask are known to be either zero
   /// or one and return them in the KnownZero/KnownOne bitsets.
   virtual void computeKnownBitsForTargetNode(const SDValue Op,
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3424,11 +3424,11 @@
   if (ROR.getNode())
     return ROR;
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
-  // iff (c1 & c2) == 0.
+  // iff (c1 & c2) == c1.
   if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
              isa<ConstantSDNode>(N0.getOperand(1))) {
     ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
-    if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) {
+    if ((C1->getAPIntValue() & N1C->getAPIntValue()) == C1->getAPIntValue()) {
       SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1);
       if (!COR.getNode())
         return SDValue();
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -284,6 +284,21 @@
                                                        const APInt &Demanded) {
   SDLoc dl(Op);
 
+  if (Op.getOpcode() == ISD::XOR) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+    // If an XOR already has all the demanded bits set, nothing to change but
+    // don't shrink either.
+    if (!C || (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+  }
+
+  assert(!Old.getNode() && !New.getNode());
+
+  // Return if the constant operand doesn't need further optimization.
+  if (DAG.getTargetLoweringInfo().optimizeConstant(Op, Demanded, *this))
+    return New.getNode();
+
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
   default: break;
@@ -293,10 +308,6 @@
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     if (!C) return false;
 
-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
-      return false;
-
     // if we can expand it to have all bits set, do it
    if (C->getAPIntValue().intersects(~Demanded)) {
      EVT VT = Op.getValueType();
@@ -1082,6 +1093,11 @@
   return false;
 }
 
+bool TargetLowering::optimizeConstant(SDValue Op, const APInt &Demanded,
+                                      TargetLoweringOpt &TLO) const {
+  return false;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified
 /// in Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
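
For reference, the DAGCombiner change above only adjusts when the canonicalization (or (and X, c1), c2) -> (and (or X, c2), c1|c2) is applied; the rewrite itself preserves the value for any constants, so the condition is purely a profitability check. A small standalone check of the identity (not part of the patch, using arbitrarily chosen sample values, with C1/C2 borrowed from the and2 test further down):

    // Standalone sanity check of the identity behind the DAGCombiner
    // canonicalization above; not part of the patch.
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C1 = 0x3dfd;      // and-mask from the and2 test below
      const uint32_t C2 = 0xffffc000;  // or-immediate from the and2 test below
      const uint32_t Xs[] = {0u, 0xffffffffu, 0x12345678u, 0x3dfdu};
      for (uint32_t X : Xs) {
        uint32_t LHS = (X & C1) | C2;        // (or (and X, c1), c2)
        uint32_t RHS = (X | C2) & (C1 | C2); // (and (or X, c2), c1|c2)
        assert(LHS == RHS && "canonicalization must preserve the value");
      }
      return 0;
    }
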
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -212,6 +212,9 @@
   /// Selects the correct CCAssignFn for a given CallingConvention value.
   CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
 
+  bool optimizeConstant(SDValue Op, const APInt &Demanded,
+                        TargetLoweringOpt &TLO) const override;
+
   /// computeKnownBitsForTargetNode - Determine which of the bits specified in
   /// Mask are known to be either zero or one and return them in the
   /// KnownZero/KnownOne bitsets.
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -37,6 +37,7 @@
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
 namespace {
 enum AlignMode {
@@ -664,6 +665,129 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+static bool optimizeConstant(unsigned Size, uint64_t Imm, uint64_t &NewImm,
+                             const APInt &Demanded) {
+  uint64_t Enc;
+  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size));
+
+  // Return if the immediate is already a bimm32 or bimm64.
+  if (AArch64_AM::processLogicalImmediate(Imm & Mask, Size, Enc)) {
+    NewImm = Imm;
+    return true;
+  }
+
+  unsigned EltSize = Size;
+  uint64_t DemandedBits = Demanded.getZExtValue();
+
+  // Clear bits that are not demanded.
+  Imm &= DemandedBits;
+
+  while (true) {
+    // The goal here is to set the non-demanded bits in a way that minimizes
+    // the number of transitions between 0 and 1. In order to achieve this
+    // goal, we set the non-demanded bits to the value of the preceding
+    // demanded bits. For example, if we have an immediate 0bx10xx0x1 ('x'
+    // indicates a non-demanded bit), we copy bit0 (1) to the least significant
+    // 'x', bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+    // The final result is 0b11000011.
+    uint64_t NonDemandedBits = ~DemandedBits;
+    uint64_t Ones =
+        (NonDemandedBits + ((~Imm & DemandedBits) << 1)) & NonDemandedBits;
+
+    // We cannot copy the preceding demanded bit if the rightmost bits are not
+    // demanded. In that case, we replicate the least significant demanded bit
+    // that follows the trailing non-demanded bits.
+    if (Ones & 1) {
+      bool FirstDemandedBitIsZero = ~Imm & ((Ones + 1 | Ones) ^ Ones) & Mask;
+
+      // If the first demanded bit is zero, clear the trailing ones in Ones.
+      if (FirstDemandedBitIsZero)
+        Ones = (Ones + 1) & Ones;
+    }
+
+    uint64_t TmpImm = (Imm | Ones) & Mask;
+
+    // If TmpImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+    // we halve the element size and continue the search.
+    if (isShiftedMask_64(TmpImm) || isShiftedMask_64(~(TmpImm | ~Mask))) {
+      Imm = TmpImm;
+      break;
+    }
+
+    // We cannot shrink the element size any further if it is 2 bits.
+    if (EltSize == 2)
+      return false;
+
+    EltSize /= 2;
+    Mask >>= EltSize;
+    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+      return false;
+
+    // Merge the upper and lower halves of Imm and DemandedBits.
+    Imm |= Hi;
+    DemandedBits |= DemandedBitsHi;
+  }
+
+  // Replicate the element across the register width.
+  while (EltSize < Size) {
+    Imm |= Imm << EltSize;
+    EltSize *= 2;
+  }
+
+  NewImm = Imm;
+  return true;
+}
+
+bool AArch64TargetLowering::optimizeConstant(SDValue Op, const APInt &Demanded,
+                                             TargetLoweringOpt &TLO) const {
+  // Delay this optimization to as late as possible.
+  if (!TLO.LegalOps)
+    return false;
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!C)
+      break;
+
+    EVT VT = Op.getValueType();
+    if (VT.isVector())
+      break;
+
+    unsigned Size = VT.getSizeInBits();
+    assert((Size == 32 || Size == 64) &&
+           "i32 or i64 is expected after legalization.");
+    uint64_t Imm = C->getZExtValue(), NewImm;
+
+    // Break if we cannot turn Imm into a bitmask immediate.
+    if (!::optimizeConstant(Size, Imm, NewImm, Demanded))
+      break;
+
+    // Return if Imm was already a bitmask immediate.
+    if (Imm == NewImm)
+      return true;
+
+    ++NumOptimizedImms;
+    assert(((NewImm ^ Imm) & Demanded.getZExtValue()) == 0 &&
+           "demanded bits should never be altered");
+
+    // Create the new constant immediate node.
+    SDValue New = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
+                                  Op.getOperand(0),
+                                  TLO.DAG.getConstant(NewImm, VT));
+    return TLO.CombineTo(Op, New);
+  }
+
+  return false;
+}
+
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them in the
 /// KnownZero/KnownOne bitsets.
Index: test/CodeGen/AArch64/optimize-imm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimize-imm.ll
@@ -0,0 +1,64 @@
+; RUN: llc -o - %s -march=aarch64 | FileCheck %s
+
+; CHECK-LABEL: _and1:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd
+
+define void @and1(i32 %a, i8* nocapture %p) {
+entry:
+  %and = and i32 %a, 253
+  %conv = trunc i32 %and to i8
+  store i8 %conv, i8* %p, align 1
+  ret void
+}
+
+; (a & 0x3dfd) | 0xffffc000
+;
+; CHECK-LABEL: _and2:
+; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd
+
+define i32 @and2(i32 %a) {
+entry:
+  %and = and i32 %a, 15869
+  %or = or i32 %and, -16384
+  ret i32 %or
+}
+
+; (a & 0x19) | 0xffffffc0
+;
+; CHECK-LABEL: _and3:
+; CHECK: and {{w[0-9]+}}, w0, #0x99999999
+
+define i32 @and3(i32 %a) {
+entry:
+  %and = and i32 %a, 25
+  %or = or i32 %and, -64
+  ret i32 %or
+}
+
+; (a & 0xc0600) | 0xfff1f1ff
+;
+; CHECK-LABEL: _and4:
+; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff
+
+define i32 @and4(i32 %a) {
+entry:
+  %and = and i32 %a, 787968
+  %or = or i32 %and, -921089
+  ret i32 %or
+}
+
+; Make sure we don't shrink or optimize an XOR's immediate operand if, on the
+; demanded bits, the immediate is effectively -1. Instruction selection turns
+; (and ((xor $mask, -1), $v0)) into a BIC.
+
+; CHECK-LABEL: _xor1:
+; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38
+; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3
+
+define i32 @xor1(i32 %a) {
+entry:
+  %shl = shl i32 %a, 3
+  %xor = and i32 %shl, 56
+  %and = xor i32 %xor, 56
+  ret i32 %and
+}
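
As a standalone sketch (not part of the patch), the snippet below runs the 0bx10xx0x1 example from the comment in the static AArch64 helper above through the Ones computation, which copies each demanded bit into the run of non-demanded bits directly above it. Bit 0 is demanded in this example, so the trailing-ones correction step is not exercised.

    // Standalone illustration of the non-demanded-bit filling used by the
    // static optimizeConstant helper above; not part of the patch.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t Mask = 0xff;          // pretend the element size is 8 bits
      const uint64_t DemandedBits = 0x65;  // demanded positions of 0bx10xx0x1
      uint64_t Imm = 0x41;                 // Imm restricted to its demanded bits: 0b01000001

      // Propagate each demanded bit upward into the non-demanded bits above it.
      uint64_t NonDemandedBits = ~DemandedBits;
      uint64_t Ones =
          (NonDemandedBits + ((~Imm & DemandedBits) << 1)) & NonDemandedBits;
      assert((Ones & 1) == 0 && "bit 0 is demanded, so no trailing-ones fixup");

      uint64_t Result = (Imm | Ones) & Mask;
      std::printf("0x%llx\n", (unsigned long long)Result); // prints 0xc3, i.e. 0b11000011
      assert(Result == 0xc3 && "matches the 0b11000011 result in the comment");
      return 0;
    }

In the 64-bit helper, NonDemandedBits and Ones can also carry set bits above the element width, which appears to be why TmpImm is masked with Mask before the isShiftedMask_64 checks.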