Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -492,6 +492,7 @@ /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector; @@ -1410,25 +1411,32 @@ public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef OpVL, - ArrayRef ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. - void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef OpVL, - ArrayRef ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. + void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + Instruction *I0 = dyn_cast(Scalars[0]); + assert(I0 && "Expected instruction."); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + Instruction *I = dyn_cast(Scalars[Lane]); + assert(I && I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1437,6 +1445,9 @@ return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1485,6 +1496,7 @@ /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, const EdgeInfo &UserTreeIdx, + ScheduleData *SchedBundle, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.push_back(llvm::make_unique(VectorizableTree)); @@ -1500,6 +1512,16 @@ assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = Last->Idx; } + // Update the scheduler bundle to point to this TreeEntry. + unsigned Lane = 0; + for (ScheduleData *BundleMember = SchedBundle; BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!SchedBundle || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1507,7 +1529,6 @@ if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1641,6 +1662,8 @@ UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1747,6 +1770,12 @@ /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1821,10 +1850,9 @@ continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto decrUnsched = [&](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1839,6 +1867,26 @@ << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. + if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + if (auto *I = dyn_cast(TE->getOperand(OpIdx)[Lane])) + decrUnsched(I); + } + } + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + else { + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast(U.get())) + decrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1885,8 +1933,9 @@ /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + std::pair + tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL, Value *OpValue); @@ -2214,28 +2263,28 @@ InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } @@ -2247,7 +2296,7 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } } @@ -2257,7 +2306,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2265,7 +2314,6 @@ E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } @@ -2277,7 +2325,7 @@ if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } } @@ -2288,7 +2336,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } } @@ -2302,7 +2350,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } @@ -2322,7 +2370,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, nullptr); return; } VL = UniqueValues; @@ -2334,12 +2382,15 @@ BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + bool Success; + ScheduleData *Bundle; + std::tie(Success, Bundle) = BS.tryScheduleBundle(VL, this, S); + if (!Success) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2360,23 +2411,28 @@ LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *j : VL) Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2386,13 +2442,13 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, Bundle, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2408,17 +2464,19 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, Bundle, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, Bundle, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2434,7 +2492,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2447,7 +2505,7 @@ auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2477,15 +2535,18 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + Bundle, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + auto *TE = + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, Bundle, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2494,7 +2555,7 @@ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2514,15 +2575,17 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2544,14 +2607,15 @@ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2572,7 +2636,8 @@ Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2597,7 +2662,8 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -2605,11 +2671,14 @@ if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2626,7 +2695,7 @@ if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); return; } } @@ -2640,7 +2709,7 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); return; } } @@ -2652,13 +2721,15 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2674,18 +2745,19 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); - + TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } @@ -2697,7 +2769,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2713,7 +2785,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -2725,7 +2797,8 @@ Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2739,14 +2812,16 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2763,22 +2838,26 @@ // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = + newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2791,7 +2870,7 @@ } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -4431,11 +4510,11 @@ // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP, - const InstructionsState &S) { +std::pair +BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, + const InstructionsState &S) { if (isa(S.OpValue)) - return true; + return {true, nullptr}; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; @@ -4448,7 +4527,7 @@ // instructions of the bundle. for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) - return false; + return {false, nullptr}; } for (Value *V : VL) { @@ -4515,9 +4594,9 @@ } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return false; + return {false, nullptr}; } - return true; + return {true, Bundle}; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL,