Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -141,6 +141,18 @@ cl::desc( "Enable merging extends and rounds into FCOPYSIGN on vector types")); +static cl::opt SelectSeq( + "combiner-select-seq", cl::Hidden, cl::init(false), + cl::desc("DAG combiner minimize condition-code lifetime for select " + "sequences over constants in arithmetic progressions"), + cl::ZeroOrMore); + +static cl::opt SelectSeqMinCostBenefit( + "combiner-select-seq-min-cost-benefit", cl::Hidden, cl::init(1), + cl::desc("Transform only when the cost benefit (instruction count to be " + "reduced by) is at-least as specified; default: 1)"), + cl::ZeroOrMore); + namespace { class DAGCombiner { @@ -804,6 +816,10 @@ void ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue OrigLoad, SDValue ExtLoad, ISD::NodeType ExtType); + + /// Rewrite sequences of selects minimizing dependency on the condition code + /// register when possible. + bool SelectSeqMinCCLifetime(void); }; /// This class is a DAGUpdateListener that removes any deleted @@ -1568,11 +1584,571 @@ return true; } +//////////////////////////////////////////////////////////////////////////////// +// +// --combiner-select-seq: +// ====================== +// +// Summary: +// -------- +// +// We deal with the special case of a sequence of select instructions that use +// the condition code register to decide between corresponding elements of two +// sequences of constants that are each, in arithmetic progression. Typically, +// the arguments to the selects would form a sequence of accesses to fields of +// a struct or the elements of an array. Instead of having the CC tied up across +// the expanse of the sequence for the pointwise selection of the constants, we +// transform the code to instead, decide upfront between the defining constructs +// of the two sequences of constants and generating the selected sequence +// elements via a corresponding sequence of add instructions. +// +// Introduction: +// ------------- +// +// Consider a sequence of selects "S", consuming a setcc such as: +// +// cond = setcc ... +// ... +// S[0]: reg[0] = select cond t[0] f[0] +// ... +// S[n-1]: reg[n-1] = select cond t[n-1] f[n-1] +// ... +// +// Two sequences arise from the operands of each select: +// +// t[0], t[1], ..., t[n-1] +// f[0], f[1], ..., f[n-1] +// +// Denote either sequence by (say) "X". (PS: Read ':=' as "is of the form") +// +// If, +// +// X[i] := offset[i] (forall i : i <- [0, n)) +// +// or, +// +// X[i] := reg[i] (forall i : i <- [0, n)) +// where: +// reg[0] := base-register +// | add base-register, offset[0] +// reg[i] := add base-register, offset[i] (forall i : i <- (0, n)) +// where: +// base-register := BaseRegDef <>, Register:(i32|i64) %<> +// +// where: +// offset[] := Constant:i64<> +// | Constant:i32<> +// BaseRegDef := CopyFromReg | Load +// +// Define: +// +// Offset(X)[0] = Constant:(|)<0> +// if X[0] := (reg[0] := base-register) +// Offset(X)[i] = offset[i] +// if X[i] := offset[i] (forall i : i <- [0, n)) +// Offset(X)[i] = offset[i] +// if X[i] := add base-register, offset[i] (forall i : i <- [0, n)) +// +// Now, (for: n > 1) if Offset(X) is an arithmetic progression, i.e: +// +// Offset(X)[i] := InitialValue(X) (if: i == 0) +// | Offset(X)[i-1] + Delta(X) (forall i : i <- (0, n)) +// where: +// InitialValue(X) = k (k is an arbitrary integer constant) +// Delta(X) = (Offset(X)[1] - Offset(X)[0]) +// +// Further define: +// BaseReg(X) = Constant:(|)<0> +// if X[0] := offset[0] +// BaseReg(X) = base-register +// if X[0] := base-register +// | add base-register, offset[0] +// +// PS: In practice, we also accept the reverse-form: +// +// reg[n-1] := base-register +// | add base-register, offset[n-1] +// reg[i] := add base-register, offset[i] (forall i : i <- [0, n)) +// where: +// base-register := BaseRegDef <>, Register:(i32<>|i64) %<> +// +// However, the essential idea remains as above. +// +// Proposition: +// ------------ +// +// Then, the sequence of selects "S", can be rewritten as sequence "S'", where +// a choice is made between the two sequences via selects on the base-register, +// the initial-value and the constant-difference (delta) between the successive +// elements - i.e. the constructs that define such a sequence, and then these +// are used to generate each access via a sequence of adds: +// +// cond = setcc ... +// # Insert: 0 +// BaseReg(S) = select cond BaseReg(t) BaseReg(f) +// # Insert: 1 +// InitialValue(S) = select cond InitialValue(t) InitialValue(f) +// # Insert: 2 +// Delta(S) = select cond Delta(t) Delta(f) +// ... +// # Rewrite: 0 +// S'[0]: reg[0] = add BaseReg(S) InitialValue(S) +// ... +// # Rewrite: i +// S'[i]: reg[i] = add reg[i-1] Delta(S) +// ... +// # Rewrite: n-1 +// S'[n-1]: reg[n-1] = add reg[n-2] Delta(S) +// ... +// +// Conclusion: +// ----------- +// +// The above code transformation has two effects: +// +// a. Minimization of the setcc lifetime. +// +// The Rewrites: [0, n) do not have a dependency on the setcc. This is the +// primary motivation for performing this transformation; see: +// +// [AArch64] Extremely slow code generation for series of function +// calls/addition #50491 +// a.k.a: https://bugs.llvm.org/show_bug.cgi?id=51147 +// +// As the length of "S" grows, the lifetime of the setcc grows, creating an +// interference with all the other computations in the DAG, thus causing +// long build times. Ths transformation helps keep the setcc lifetime minimal +// in this case. +// +// b. Code size reduction. +// +// Also, while we have (upto) three new selects (Inserts [0-2]), we +// eliminate one select and (upto) two adds with each rewrite [0, n) (which +// brings in an add) therefore, reducing overall code size, and potentially +// improving the runtime performance of the code. +// +// NewCodeSize = OldCodeSize - (3 * n) + (3 + n) (Best case). +// +// Notes: +// ------ +// +// As a future extension, extend the transformation to include sequences not +// in arithmetic progression by creating two lookup tables for storing the +// constant offsets for the [tf]-sequences, and selecting the appropriate +// table once, based on the setcc value. Then, the offset to add to the base +// register can be looked up in its entry in the appropriate table; the idea +// being similar to the scheme above. +// +//////////////////////////////////////////////////////////////////////////////// +bool DAGCombiner::SelectSeqMinCCLifetime(void) { + + LLVM_DEBUG(dbgs() << "SelectSeqMinCCLifetime:\n"); + + LLVM_DEBUG(dbgs() << "DAG PRE:\n"); + for (SDNode &Node : DAG.allnodes()) + LLVM_DEBUG(dbgs() << ""; Node.dump()); + + // Run through the DAG, looking for selects that use setcc, + // collect the setcc operand if we have not collected it already: + SmallSet CandidateSetcc; + for (SDNode &Node : DAG.allnodes()) { + if (Node.getOpcode() != ISD::SELECT) + continue; + SDValue Op0 = Node.getOperand(0); + if (Op0.getOpcode() != ISD::SETCC) + continue; + if (!CandidateSetcc.contains(Op0)) + CandidateSetcc.insert(Op0); + } + + auto ProcessSetcc = [this](SDValue N) -> bool { + bool DAGModified = false; + assert(N.getOpcode() == ISD::SETCC); + LLVM_DEBUG(dbgs() << "Setcc: "; N.dump()); + // SelectUser: All the select instructions that use this setcc. + // SelectUser lists the selects in the order they appear in the DAG i.e. + // going from top to down in the DAG, they appear in left to right order, 0 + // onwards ... Ultimately, the accesses are generated using SelectUser. + SmallVector SelectUser; + SmallVector TSeq; // Operand 1 of each SelectUser + SmallVector FSeq; // Operand 2 of each SelectUser + SDNodeFlags AccessAddrFlags; + unsigned i = 0; + // Collect the 't' & 'f' operands: + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; + ++UI, ++i) { + // NOTE: The SDNode::use_iterator presents the use's in *reverse* order. + SDNode *User = *UI; + if (User->getOpcode() != ISD::SELECT) + continue; + SDValue N1 = User->getOperand(1); + SDValue N2 = User->getOperand(2); + auto itU = SelectUser.begin(); + SelectUser.insert(itU, User); + auto itO1 = TSeq.begin(); + TSeq.insert(itO1, N1); + auto itO2 = FSeq.begin(); + FSeq.insert(itO2, N2); + SDNodeFlags N1Flags = N1->getFlags(); + SDNodeFlags N2Flags = N2->getFlags(); + // Note the flags for later use ... + if (N1.getOpcode() == ISD::ADD) + AccessAddrFlags = N1Flags; + if (N2.getOpcode() == ISD::ADD) + AccessAddrFlags = N2Flags; + if (N1.getOpcode() != ISD::ADD || N2.getOpcode() != ISD::ADD) + continue; + // Both operands are ISD::ADD's Ascertain that either: Both operands have + // the same (NoSigned/NoUnsigned)Wrap flags or both operands have no + // (NoSigned/NoUnsigned)Wrap flags: + if (!((N1Flags.hasNoSignedWrap() && N2Flags.hasNoSignedWrap()) || + (N1Flags.hasNoUnsignedWrap() && N2Flags.hasNoUnsignedWrap()) || + ((!N1Flags.hasNoSignedWrap() && !N2Flags.hasNoSignedWrap()) && + (!N1Flags.hasNoUnsignedWrap() && !N2Flags.hasNoUnsignedWrap())))) { + LLVM_DEBUG( + dbgs() << "Operand (Signed/Unsigned)flags mismatch; skip.\n"); + return false; + } + } + LLVM_DEBUG(dbgs() << "User:\n"); + for (unsigned i = 0; i < SelectUser.size(); ++i) { + LLVM_DEBUG(dbgs() << i << " "; SelectUser[i]->dump()); + for (const SDValue &Op : SelectUser[i]->op_values()) + LLVM_DEBUG(dbgs() << " "; Op.dump()); + } + auto WellFormedAP = [this](const SmallVector &OpSeq, + SDValue &BaseReg, int64_t &InitialVal, + int64_t &Delta, uint64_t &ADDCount) -> bool { + auto AscertainConstDiff = [](const SmallVector &RawOffsets, + int64_t diff) -> bool { + bool WF = true; + for (unsigned i = 1; WF && (i < RawOffsets.size()); ++i) + WF &= (diff == (RawOffsets[i] - RawOffsets[i - 1])); + return WF; + }; + if (OpSeq.size() < 2) { + LLVM_DEBUG(dbgs() << "Need at least two elements in an arithmetic " + "progression; skip.\n"); + return false; + } + bool WF = true; + SmallVector RawOffsets; + if (OpSeq[0].getOpcode() == ISD::Constant) { + for (auto V : OpSeq) { + WF &= ((V.getOpcode() == ISD::Constant)); + if (WF) { + auto *C = dyn_cast(V); + if (C && C->getAPIntValue().getMinSignedBits() <= 64) + RawOffsets.push_back(C->getSExtValue()); + else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } else { + LLVM_DEBUG(dbgs() + << "If the zeroeth element is a constant, so must be " + "each element in the sequence; skip.\n"); + return false; + } + } + BaseReg = DAG.getConstant(0, SDLoc(OpSeq[0]), OpSeq[0].getValueType()); + auto *C0 = dyn_cast(OpSeq[0]); + auto *C1 = dyn_cast(OpSeq[1]); + if ((C0 && C0->getAPIntValue().getMinSignedBits() <= 64) && + (C1 && C1->getAPIntValue().getMinSignedBits() <= 64)) { + InitialVal = C0->getSExtValue(); + Delta = (C1->getSExtValue()) - InitialVal; + WF &= AscertainConstDiff(RawOffsets, Delta); + return WF; + } else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } else { + // Three cases arise: + // 0. All the elements are ISD::ADD's + // 1. The zeroeth element is a base register definition and the rest are + // ISD::ADD's + // 2. The OpSeq.size()-1'th element is a base register definition and + // the rest are ISD::ADD's + // l: inclusive lower bound + // u: non-inclusive upper bound + auto isBaseRegDef = [](const SDValue V) -> bool { + return (V.getOpcode() == ISD::CopyFromReg) || + (V.getOpcode() == ISD::LOAD); + }; + unsigned l, u; + if (OpSeq[0].getOpcode() == ISD::ADD && + OpSeq[OpSeq.size() - 1].getOpcode() == ISD::ADD) { + l = 0; + u = OpSeq.size(); + if (isBaseRegDef(OpSeq[0].getOperand(0))) { + BaseReg = OpSeq[0].getOperand(0); + } else { + LLVM_DEBUG(dbgs() << "Unable to get BaseReg; skip.\n"); + return false; + } + } else if (isBaseRegDef(OpSeq[0])) { + l = 1; + u = OpSeq.size(); + BaseReg = OpSeq[0]; + } else if (isBaseRegDef(OpSeq[OpSeq.size() - 1])) { + l = 0; + u = OpSeq.size() - 1; + BaseReg = OpSeq[OpSeq.size() - 1]; + } else { + LLVM_DEBUG( + dbgs() + << "Sequence not in (Add)+|(BaseRegDef)(Add)+|(Add)+(BaseRegDef) " + "form; skip.\n"); + return false; + } + // Ascertain that the elements in OpSeq[l, u) are all ISD::ADD's in the + // form: BaseReg + (constant offset) + for (unsigned i = l; WF && (i < u); ++i) { + WF &= ((OpSeq[i].getOpcode() == ISD::ADD) && + (OpSeq[i].getOperand(0) == BaseReg) && + (OpSeq[i].getOperand(1).getOpcode() == ISD::Constant)); + if (WF) { + ++ADDCount; + auto *C = dyn_cast(OpSeq[i].getOperand(1)); + if (C && C->getAPIntValue().getMinSignedBits() <= 64) + RawOffsets.push_back(C->getSExtValue()); + else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } else { + LLVM_DEBUG(dbgs() << "Sequence not in (Add = BaseReg + )+ form; skip.\n"); + return false; + } + } + if (!WF) + return WF; + if (isBaseRegDef(OpSeq[0])) { + InitialVal = 0; + // Element 1 is guaranteed to be ISD::ADD in the form: + // BaseReg + (constant offset) + auto *C = dyn_cast(OpSeq[1].getOperand(1)); + if (C && C->getAPIntValue().getMinSignedBits() <= 64) + Delta = (C->getSExtValue()) - InitialVal; + else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } else { // Element 0 is an ISD::ADD + auto *C = dyn_cast(OpSeq[0].getOperand(1)); + if (C && C->getAPIntValue().getMinSignedBits() <= 64) { + InitialVal = C->getSExtValue(); + if (OpSeq[1].getOpcode() == ISD::ADD) { + auto *C = dyn_cast(OpSeq[1].getOperand(1)); + if (C && C->getAPIntValue().getMinSignedBits() <= 64) { + Delta = (C->getSExtValue()) - InitialVal; + } else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } else { // Element 1 is an BaseRegDef + Delta = 0 - InitialVal; + } + } else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } + WF &= AscertainConstDiff(RawOffsets, Delta); + return WF; + } + }; + uint64_t TSeqADDCount = 0; + SDValue TSeqBaseReg; + int64_t TSeqInitialVal = 0; + int64_t TSeqDelta = 0; + bool TSeqValid = WellFormedAP(TSeq, TSeqBaseReg, TSeqInitialVal, TSeqDelta, + TSeqADDCount); + uint64_t FSeqADDCount = 0; + SDValue FSeqBaseReg; + int64_t FSeqInitialVal = 0; + int64_t FSeqDelta = 0; + bool FSeqValid = WellFormedAP(FSeq, FSeqBaseReg, FSeqInitialVal, FSeqDelta, + FSeqADDCount); + if (!TSeqValid || !FSeqValid) { + LLVM_DEBUG(dbgs() << "Operands not well formed or not in aritmetic " + "progression; skip.\n"); + return false; + } + EVT TSeqEltVT = TSeqBaseReg.getValueType(); + EVT FSeqEltVT = FSeqBaseReg.getValueType(); + EVT TSeqEltSTVT = TSeqEltVT.getScalarType(); + EVT FSeqEltSTVT = FSeqEltVT.getScalarType(); + SDValue TSeqInitialValReg = DAG.getConstant( + APInt(TSeqEltSTVT.getSizeInBits(), (uint64_t)TSeqInitialVal, true), + SDLoc(N), TSeqEltVT); + SDValue TSeqDeltaReg = DAG.getConstant( + APInt(TSeqEltSTVT.getSizeInBits(), (uint64_t)TSeqDelta, true), SDLoc(N), + TSeqEltVT); + SDValue FSeqInitialValReg = DAG.getConstant( + APInt(FSeqEltSTVT.getSizeInBits(), (uint64_t)FSeqInitialVal, true), + SDLoc(N), FSeqEltVT); + SDValue FSeqDeltaReg = DAG.getConstant( + APInt(FSeqEltSTVT.getSizeInBits(), (uint64_t)FSeqDelta, true), SDLoc(N), + FSeqEltVT); + // Sequence selector nodes: + // Construct a select for the base register: + SDValue BaseRegSelect = DAG.getSelect( + SDLoc(N), + TSeqBaseReg.getValueType(), // Note: We could use FSeqBaseReg as well. + N, TSeqBaseReg, FSeqBaseReg); + LLVM_DEBUG(dbgs() << "BaseReg: "; BaseRegSelect->dump()); + // Construct a select for the initial value: + SDValue InitialValSelect = DAG.getSelect( + SDLoc(N), + TSeqInitialValReg + .getValueType(), // Note: We could use FSeqInitialValReg as well. + N, TSeqInitialValReg, FSeqInitialValReg); + LLVM_DEBUG(dbgs() << "InitialVal: "; InitialValSelect->dump()); + // Construct a select for the delta value: + SDValue DeltaSelect = + DAG.getSelect(SDLoc(N), TSeqDeltaReg.getValueType(), + // Note: We could use FSeqDeltaReg as well. + N, TSeqDeltaReg, FSeqDeltaReg); + LLVM_DEBUG(dbgs() << "Delta: "; DeltaSelect->dump()); + // Check if any of the sequence selector nodes correspond to the SelectUser + // nodes. If so, we need to bail out here itself as we will end-up creating + // a cycle in the generated access address nodes, after we substitute the + // the generated access addresss in the uses of the select nodes that we are + // eliminating. The exception to this is when the BaseReg is SelectUser[0] + // and the InitialValue is the constant 0; in this case, AccessAddr[0] will + // reduce to the BaseReg, so this is not an issue. + auto Overlap = [](const SmallVector NodeList, + const SDNode *Node) -> unsigned { + unsigned Index = 0; + for (auto N : NodeList) + if (N == Node) + return Index; + else + ++Index; + return Index; + }; + auto isZero = [](SDValue V) -> bool { + if (V.getOpcode() == ISD::Constant) { + auto *C = dyn_cast(V); + if (C && C->getAPIntValue().getMinSignedBits() <= 64) + return C->getSExtValue() == 0; + else { + LLVM_DEBUG(dbgs() << "Unable to obtain value; skip.\n"); + return false; + } + } + return false; + }; + unsigned SUSize = SelectUser.size(); + unsigned BRIndex = Overlap(SelectUser, BaseRegSelect.getNode()); + if ((((BRIndex != 0) || !isZero(InitialValSelect)) && + (BRIndex != SUSize)) || + (Overlap(SelectUser, InitialValSelect.getNode()) != SUSize) || + (Overlap(SelectUser, DeltaSelect.getNode()) != SUSize)) { + LLVM_DEBUG( + dbgs() + << "Generated code will introduce cycle(s) in the DAG; skip.\n"); + return DAGModified = false; + } + auto CostBenefit = [&SelectUser, &TSeqADDCount, &FSeqADDCount, &BRIndex, + &InitialValSelect, &isZero]() -> int64_t { + bool Adjusted = false; + // select instructions we think we are going to subtract: + uint64_t SelectInstCount = SelectUser.size(); // select's + // Instructions we think we are going to add: + uint64_t SeqSelectorCount = 3; // 1 BaseReg, 1 InitialVal, 1 Delta + uint64_t AddInstCount = SelectInstCount; // add's (AccessAddr's below) + if (BRIndex == 0 && isZero(InitialValSelect)) { + // When the BaseReg is SelectUser[0] and the InitialValue is the + // constant 0; AccessAddr[0] will reduce to the BaseReg, so, we will + // eliminate one select less and we will add one ADD less, so make the + // appropriate adjustments: + --SelectInstCount; + --AddInstCount; + // ... but, we counted 1 for the BaseReg, and 1 for the InitialValue, + // which are not newly added instructions now, so: + SeqSelectorCount -= 2; + // Lastly: + Adjusted = true; + } + uint64_t ToBeEliminatedInstCount = + SelectInstCount + TSeqADDCount + FSeqADDCount; + uint64_t ToBeAddedInstCount = AddInstCount + SeqSelectorCount; + int64_t Benefit = ToBeEliminatedInstCount - ToBeAddedInstCount; + LLVM_DEBUG(dbgs() << "Cost Benefit Analysis:\n"); + LLVM_DEBUG(dbgs() << "Adjusted: " << Adjusted << "\n"); + LLVM_DEBUG(dbgs() << "Number of select's eliminated: " << SelectInstCount + << "\n"); + LLVM_DEBUG(dbgs() << "Number of TSeq ADD's eliminated: " << TSeqADDCount + << "\n"); + LLVM_DEBUG(dbgs() << "Number of FSeq ADD's eliminated: " << FSeqADDCount + << "\n"); + LLVM_DEBUG(dbgs() << "Number of Sequence Selectors added: " + << SeqSelectorCount << "\n"); + LLVM_DEBUG(dbgs() << "Number of ADD accesses added: " << AddInstCount + << "\n"); + LLVM_DEBUG( + dbgs() << "CostBenefit (i.e. instruction count to be reduced by): " + << Benefit << "\n"); + LLVM_DEBUG(dbgs() << "SelectSeqMinCostBenefit: " + << SelectSeqMinCostBenefit << "\n"); + return Benefit; + }; + if (CostBenefit() < SelectSeqMinCostBenefit) { + LLVM_DEBUG( + dbgs() + << "Generated code will not be of cost benefit to the DAG; skip.\n"); + return DAGModified = false; + } + // Construct the Access address nodes: + SmallVector AccessAddr; + LLVM_DEBUG(dbgs() << "AccessAddr:\n"); + for (unsigned i = 0; i < SelectUser.size(); ++i) { + SDLoc DL(SelectUser[i]); + SDValue CurAccessAddr; + if (i == 0) + CurAccessAddr = DAG.getNode(ISD::ADD, DL, BaseRegSelect.getValueType(), + BaseRegSelect, InitialValSelect); + else + CurAccessAddr = + DAG.getNode(ISD::ADD, DL, AccessAddr[i - 1].getValueType(), + AccessAddr[i - 1], DeltaSelect); + // DAG.getNode() could have optimized and returned an ISD::SELECT if the + // InitialValSelect reduced to the constant zero, so check that we have an + // ISD:ADD before setting the flags: + if (CurAccessAddr.getOpcode() == ISD::ADD) + CurAccessAddr->setFlags(AccessAddrFlags); + AccessAddr.push_back(CurAccessAddr); + DAG.ReplaceAllUsesWith(SelectUser[i], CurAccessAddr.getNode()); + DAGModified = true; + LLVM_DEBUG(dbgs() << i << " "; CurAccessAddr->dump()); + } + DAG.RemoveDeadNodes(); + return DAGModified; + }; + + bool DAGModified = false; + for (auto Setcc : CandidateSetcc) + DAGModified |= ProcessSetcc(Setcc); + + LLVM_DEBUG(dbgs() << "DAG PST:\n"); + for (SDNode &Node : DAG.allnodes()) + LLVM_DEBUG(dbgs() << ""; Node.dump()); + + return DAGModified; +} + //===----------------------------------------------------------------------===// // Main DAG Combiner implementation //===----------------------------------------------------------------------===// void DAGCombiner::Run(CombineLevel AtLevel) { + if (SelectSeq && (AtLevel == BeforeLegalizeTypes)) + SelectSeqMinCCLifetime(); // set the instance variables, so that the various visit routines may use it. Level = AtLevel; LegalDAG = Level >= AfterLegalizeDAG; Index: llvm/test/CodeGen/AArch64/combiner-select-seq.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/combiner-select-seq.ll @@ -0,0 +1,240 @@ +; RUN: llc < %s -march=aarch64 -simplify-mir -stop-after=amdgpu-isel | FileCheck %s --check-prefix=PRE +; RUN: llc < %s -march=aarch64 -simplify-mir -stop-after=amdgpu-isel --combiner-select-seq -combiner-select-seq-min-cost-benefit=5 | FileCheck %s --check-prefix=PST + +; Description: +; ------------ +; Given: +; struct A { +; char junk[53]; +; int a0; +; int a1; +; int a2; +; int a3; +; int a4; +; int a5; +; int a6; +; int a7; +; }; +; extern int sum(struct A *pa); +; extern int access(void *p); +; int sum(struct A *pa) { +; int s = 0; +; s += access(pa ? (void *) &pa->a0 : (void *) 91); +; s += access(pa ? (void *) &pa->a1 : (void *) 81); +; s += access(pa ? (void *) &pa->a2 : (void *) 71); +; s += access(pa ? (void *) &pa->a3 : (void *) 61); +; s += access(pa ? (void *) &pa->a4 : (void *) 51); +; s += access(pa ? (void *) &pa->a5 : (void *) 41); +; s += access(pa ? (void *) &pa->a6 : (void *) 31); +; s += access(pa ? (void *) &pa->a7 : (void *) 21); +; return s; +; } +; Compiled into LLVM IR, thus: -O3 --target=aarch64 -emit-llvm -S +; Do we, under --combiner-select-seq, identify the setcc: +; Setcc: t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; along with its select users: +; User: +; 0 t9: i64 = select t5, Constant:i64<91>, t7 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t8: i64 = Constant<91> +; t7: i64 = add nuw t2, Constant:i64<56> +; 1 t26: i64 = select t5, Constant:i64<81>, t24 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t25: i64 = Constant<81> +; t24: i64 = add nuw t2, Constant:i64<60> +; 2 t37: i64 = select t5, Constant:i64<71>, t35 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t36: i64 = Constant<71> +; t35: i64 = add nuw t2, Constant:i64<64> +; 3 t48: i64 = select t5, Constant:i64<61>, t46 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t47: i64 = Constant<61> +; t46: i64 = add nuw t2, Constant:i64<68> +; 4 t59: i64 = select t5, Constant:i64<51>, t57 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t58: i64 = Constant<51> +; t57: i64 = add nuw t2, Constant:i64<72> +; 5 t70: i64 = select t5, Constant:i64<41>, t68 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t69: i64 = Constant<41> +; t68: i64 = add nuw t2, Constant:i64<76> +; 6 t81: i64 = select t5, Constant:i64<31>, t79 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t80: i64 = Constant<31> +; t79: i64 = add nuw t2, Constant:i64<80> +; 7 t92: i64 = select t5, Constant:i64<21>, t90 +; t5: i1 = setcc t2, Constant:i64<0>, seteq:ch +; t91: i64 = Constant<21> +; t90: i64 = add nuw t2, Constant:i64<84> +; defining the two sequences: +; t-seq: ( 0 + 91), ( 0 + 81), ..., (0 + 31), (0 + 21) +; f-seq: (t2 + 56), (t2 + 60), ..., (t2 + 80), (t2 + 84) +; and derive and inject the sequence selectors: +; BaseReg: t104: i64 = select t5, Constant:i64<0>, t2 +; InitialVal: t105: i64 = select t5, Constant:i64<91>, Constant:i64<56> +; Delta: t106: i64 = select t5, Constant:i64<-10>, Constant:i64<4> +; that define: +; AccessAddr: +; 0 t107: i64 = add nuw t104, t105 +; 1 t108: i64 = add nuw t107, t106 +; 2 t109: i64 = add nuw t108, t106 +; 3 t110: i64 = add nuw t109, t106 +; 4 t111: i64 = add nuw t110, t106 +; 5 t112: i64 = add nuw t111, t106 +; 6 t113: i64 = add nuw t112, t106 +; 7 t114: i64 = add nuw t113, t106 +; and, do we rewrite the DAG, eliminating the selects and the base-displacement +; add's so that instead, the AccessAddr's are now used? + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.A = type { [53 x i8], i32, i32, i32, i32, i32, i32, i32, i32 } + +define dso_local i32 @sum(ptr noundef readonly %pa) local_unnamed_addr { +entry: + %tobool.not = icmp eq ptr %pa, null + %a0 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 1 + %cond = select i1 %tobool.not, ptr inttoptr (i64 91 to ptr), ptr %a0 + %call = tail call i32 @access(ptr noundef nonnull %cond) + %a1 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 2 + %cond5 = select i1 %tobool.not, ptr inttoptr (i64 81 to ptr), ptr %a1 + %call6 = tail call i32 @access(ptr noundef nonnull %cond5) + %add7 = add nsw i32 %call6, %call + %a2 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 3 + %cond12 = select i1 %tobool.not, ptr inttoptr (i64 71 to ptr), ptr %a2 + %call13 = tail call i32 @access(ptr noundef nonnull %cond12) + %add14 = add nsw i32 %add7, %call13 + %a3 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 4 + %cond19 = select i1 %tobool.not, ptr inttoptr (i64 61 to ptr), ptr %a3 + %call20 = tail call i32 @access(ptr noundef nonnull %cond19) + %add21 = add nsw i32 %add14, %call20 + %a4 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 5 + %cond26 = select i1 %tobool.not, ptr inttoptr (i64 51 to ptr), ptr %a4 + %call27 = tail call i32 @access(ptr noundef nonnull %cond26) + %add28 = add nsw i32 %add21, %call27 + %a5 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 6 + %cond33 = select i1 %tobool.not, ptr inttoptr (i64 41 to ptr), ptr %a5 + %call34 = tail call i32 @access(ptr noundef nonnull %cond33) + %add35 = add nsw i32 %add28, %call34 + %a6 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 7 + %cond40 = select i1 %tobool.not, ptr inttoptr (i64 31 to ptr), ptr %a6 + %call41 = tail call i32 @access(ptr noundef nonnull %cond40) + %add42 = add nsw i32 %add35, %call41 + %a7 = getelementptr inbounds %struct.A, ptr %pa, i64 0, i32 8 + %cond47 = select i1 %tobool.not, ptr inttoptr (i64 21 to ptr), ptr %a7 + %call48 = tail call i32 @access(ptr noundef nonnull %cond47) + %add49 = add nsw i32 %add42, %call48 + ret i32 %add49 +} + +declare dso_local noundef i32 @access(ptr nocapture noundef readonly) + +; PRE-LABEL: name: sum +; PRE: [[R0:%[0-9]+]]:gpr64common = COPY $x0 +; PRE-DAG: [[R1:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 56, 0 +; PRE-DAG: [[R2:%[0-9]+]]:gpr64 = SUBSXri [[R0]], 0, 0, implicit-def $nzcv +; PRE-DAG: [[R3:%[0-9]+]]:gpr32 = MOVi32imm 91 +; PRE: [[R4:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R3]], %subreg.sub_32 +; PRE-DAG: [[R5:%[0-9]+]]:gpr64 = CSELXr killed [[R4]], killed [[R1]], 0, implicit $nzcv +; PRE-DAG: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PRE-DAG: [[R6:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 60, 0 +; PRE-DAG: [[R7:%[0-9]+]]:gpr32 = MOVi32imm 81 +; PRE: [[R8:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R7]], %subreg.sub_32 +; PRE-DAG: [[R9:%[0-9]+]]:gpr64 = CSELXr killed [[R8]], killed [[R6]], 0, implicit $nzcv +; PRE-DAG: [[R10:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 64, 0 +; PRE-DAG: [[R11:%[0-9]+]]:gpr32 = MOVi32imm 71 +; PRE: [[R12:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R11]], %subreg.sub_32 +; PRE-DAG: [[R13:%[0-9]+]]:gpr64 = CSELXr killed [[R12]], killed [[R10]], 0, implicit $nzcv +; PRE-DAG: [[R14:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 68, 0 +; PRE-DAG: [[R15:%[0-9]+]]:gpr32 = MOVi32imm 61 +; PRE: [[R16:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R15]], %subreg.sub_32 +; PRE-DAG: [[R17:%[0-9]+]]:gpr64 = CSELXr killed [[R16]], killed [[R14]], 0, implicit $nzcv +; PRE-DAG: [[R18:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 72, 0 +; PRE-DAG: [[R19:%[0-9]+]]:gpr32 = MOVi32imm 51 +; PRE: [[R20:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R19]], %subreg.sub_32 +; PRE-DAG: [[R21:%[0-9]+]]:gpr64 = CSELXr killed [[R20]], killed [[R18]], 0, implicit $nzcv +; PRE-DAG: [[R22:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 76, 0 +; PRE-DAG: [[R23:%[0-9]+]]:gpr32 = MOVi32imm 41 +; PRE: [[R24:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R23]], %subreg.sub_32 +; PRE-DAG: [[R25:%[0-9]+]]:gpr64 = CSELXr killed [[R24]], killed [[R22]], 0, implicit $nzcv +; PRE-DAG: [[R26:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 80, 0 +; PRE-DAG: [[R27:%[0-9]+]]:gpr32 = MOVi32imm 31 +; PRE: [[R28:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R27]], %subreg.sub_32 +; PRE-DAG: [[R29:%[0-9]+]]:gpr64 = CSELXr killed [[R28]], killed [[R26]], 0, implicit $nzcv +; PRE-DAG: [[R30:%[0-9]+]]:gpr64common = nuw ADDXri [[R0]], 84, 0 +; PRE-DAG: [[R31:%[0-9]+]]:gpr32 = MOVi32imm 21 +; PRE: [[R32:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R31]], %subreg.sub_32 +; PRE: [[R33:%[0-9]+]]:gpr64 = CSELXr killed [[R32]], killed [[R30]], 0, implicit $nzcv + +; PST-LABEL: name: sum +; PST: [[R0:%[0-9]+]]:gpr64common = COPY $x0 +; PST-DAG: [[R1:%[0-9]+]]:gpr64 = SUBSXri [[R0]], 0, 0, implicit-def $nzcv +; PST-DAG: [[R2:%[0-9]+]]:gpr32 = MOVi32imm 4 +; PST: [[R3:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R2]], %subreg.sub_32 +; PST: [[R4:%[0-9]+]]:gpr64 = MOVi64imm -10 +; PST-DAG: [[R5:%[0-9]+]]:gpr64 = CSELXr killed [[R4]], killed [[R3]], 0, implicit $nzcv +; PST-DAG: [[R6:%[0-9]+]]:gpr32 = MOVi32imm 56 +; PST: [[R7:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R6]], %subreg.sub_32 +; PST: [[R8:%[0-9]+]]:gpr32 = MOVi32imm 91 +; PST: [[R9:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[R8]], %subreg.sub_32 +; PST-DAG: [[R10:%[0-9]+]]:gpr64 = CSELXr killed [[R9]], killed [[R7]], 0, implicit $nzcv +; PST-DAG: [[R11:%[0-9]+]]:gpr64 = COPY $xzr +; PST: [[R12:%[0-9]+]]:gpr64 = CSELXr [[R11]], [[R0]], 0, implicit $nzcv +; PST: [[R13:%[0-9]+]]:gpr64 = nuw ADDXrr killed [[R12]], killed [[R10]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R13]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST-DAG: [[R14:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R15:%[0-9]+]]:gpr64 = nuw ADDXrr [[R13]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R15]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R16:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R17:%[0-9]+]]:gpr32 = nsw ADDWrr [[R16]], [[R14]] +; PST-DAG: [[R18:%[0-9]+]]:gpr64 = nuw ADDXrr [[R15]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R18]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R19:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R20:%[0-9]+]]:gpr32 = nsw ADDWrr killed [[R17]], [[R19]] +; PST-DAG: [[R21:%[0-9]+]]:gpr64 = nuw ADDXrr [[R18]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R21]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R22:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R23:%[0-9]+]]:gpr32 = nsw ADDWrr killed [[R20]], [[R22]] +; PST-DAG: [[R24:%[0-9]+]]:gpr64 = nuw ADDXrr [[R21]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R24]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R25:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R26:%[0-9]+]]:gpr32 = nsw ADDWrr killed [[R23]], [[R25]] +; PST-DAG: [[R27:%[0-9]+]]:gpr64 = nuw ADDXrr [[R24]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R27]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R28:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R29:%[0-9]+]]:gpr32 = nsw ADDWrr killed [[R26]], [[R28]] +; PST-DAG: [[R30:%[0-9]+]]:gpr64 = nuw ADDXrr [[R27]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R30]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R31:%[0-9]+]]:gpr32 = COPY $w0 +; PST-DAG: [[R32:%[0-9]+]]:gpr32 = nsw ADDWrr killed [[R29]], [[R31]] +; PST-DAG: [[R33:%[0-9]+]]:gpr64 = nuw ADDXrr [[R30]], [[R5]] +; PST: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp +; PST: $x0 = COPY [[R33]] +; PST: BL @access, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 +; PST: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp +; PST: [[R34:%[0-9]+]]:gpr32 = COPY $w0 +; PST: [[R35:%[0-9]+]]:gpr32 = nsw ADDWrr killed [[R32]], [[R34]] +; PST: $w0 = COPY [[R35]] +; PST: RET_ReallyLR implicit $w0