Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -516,11 +516,14 @@ /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost(); + int getSpillCost(const SmallPtrSetImpl &ScalarsToVec); + + /// \returns the cost of extracting vectorized elements. + int getExtractCost(const SmallPtrSetImpl &ScalarsToVec); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeCost(); + int getTreeCost(bool ReduceTree = false); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -541,6 +544,7 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + RemovedOprations.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { @@ -600,6 +604,9 @@ /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + /// Reduce the cost of the tree to make it partially vectorizable, if possible. + bool reduceTree(); + OptimizationRemarkEmitter *getORE() { return ORE; } private: @@ -612,7 +619,7 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -626,7 +633,7 @@ Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(Value *Parent, ArrayRef VL); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -700,15 +707,26 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Cost of the tree entry. + int Cost = 0; + + /// Full cost of expanding the tree at this height. + int ExpandAtCost = 0; + + /// Parent operation of this entry's operations. + Value *Parent = nullptr; }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(Value *Parent, ArrayRef VL, bool Vectorized, + int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; + Last->Parent = Parent; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -742,6 +760,9 @@ /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; + /// Tree entries that should not be vectorized due to throttling. + SmallVector RemovedOprations; + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1170,6 +1191,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove operations from the list of operations proposed for scheduling.
+ void removeFromScheduling(BlockScheduling *BS, ArrayRef Oprations); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); @@ -1330,7 +1354,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -1391,35 +1415,35 @@ } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } @@ -1431,7 +1455,7 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } } @@ -1441,7 +1465,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -1460,7 +1484,7 @@ if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } } @@ -1471,7 +1495,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } } @@ -1485,7 +1509,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } @@ -1505,7 +1529,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(Parent, VL, false, UserTreeIdx); return; } VL = UniqueValues; @@ -1522,7 +1546,7 @@ assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1543,12 +1567,12 @@ LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +1582,7 @@ Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1569,7 +1593,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -1586,12 +1610,12 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -1607,7 +1631,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1620,7 +1644,7 @@ auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1674,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(Parent, VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1667,7 +1691,7 @@ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1687,13 +1711,13 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +1726,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1716,14 +1740,14 @@ if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1732,7 +1756,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1755,7 +1779,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1763,8 +1787,8 @@ if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Left, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Right, Depth + 1, UserTreeIdx); return; } @@ -1774,7 +1798,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; @@ -1784,7 +1808,7 @@ if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1798,7 +1822,7 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different 
types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1810,12 +1834,12 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +1847,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1832,19 +1856,19 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1855,7 +1879,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1869,7 +1893,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1880,7 +1904,7 @@ Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1892,14 +1916,14 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
@@ -1907,7 +1931,7 @@ CallInst *CI2 = dyn_cast(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1916,19 +1940,19 @@ // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Left, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Right, Depth + 1, UserTreeIdx); return; } @@ -1938,13 +1962,13 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(S.OpValue, Operands, Depth + 1, UserTreeIdx); } return; default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(Parent, VL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2413,6 +2437,55 @@ } } +bool BoUpSLP::reduceTree() { + bool Reduced = false; + + // Estimating where to stop vectorization. + unsigned StopAt = 0; + int MinCost = VectorizableTree.back().ExpandAtCost; + for (unsigned I = VectorizableTree.size(); I--;) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + if (VectorizableTree[I].ExpandAtCost < MinCost) { + MinCost = VectorizableTree[I].ExpandAtCost; + Reduced = true; + StopAt = I; + } + } + + // Canceling unprofitable elements. + if (StopAt > 0 && StopAt < (VectorizableTree.size() - 1)) { + int ReducedBy = MinCost - VectorizableTree.back().ExpandAtCost; + LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << ReducedBy + << " to make it partially vectorizable.\n"); + Reduced = true; + for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) { + TreeEntry *Entry = &VectorizableTree[I]; + if (!Entry->NeedToGather) { + if (I >= StopAt) { + Entry->NeedToGather = true; + for (Value *V : Entry->Scalars) { + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); + RemovedOprations.push_back(I); + ScalarToTreeEntry.erase(V); + MustGather.insert(V); + ExternalUses.erase(std::remove_if(ExternalUses.begin(), + ExternalUses.end(), + [&](ExternalUser &EU) { + return EU.Scalar == V; + }), + ExternalUses.end()); + } + } + } + } + } + + return Reduced; +} + bool BoUpSLP::isFullyVectorizableTinyTree() { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); @@ -2457,7 +2530,7 @@ return true; } -int BoUpSLP::getSpillCost() { +int BoUpSLP::getSpillCost(const SmallPtrSetImpl &ScalarsToVec) { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it @@ -2481,7 +2554,7 @@ // Update LiveValues. 
LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && ScalarsToVec.count(&*J) > 0) LiveValues.insert(cast(&*J)); } @@ -2512,23 +2585,52 @@ V.push_back(VectorType::get(II->getType(), BundleWidth)); Cost += TTI->getCostOfKeepingLiveOverCall(V); } - ++PrevInstIt; } - PrevInst = Inst; } return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; - LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " - << VectorizableTree.size() << ".\n"); +int BoUpSLP::getExtractCost(const SmallPtrSetImpl &ScalarsToVec) { + int ExtractCost = 0; + unsigned BundleWidth = VectorizableTree.front().Scalars.size(); + SmallPtrSet ExtractCostCalculated; + for (ExternalUser &EU : ExternalUses) { + + // Avoid non-vectorized scalars at this tree height. + if (ScalarsToVec.count(EU.Scalar) == 0) + continue; - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + if (MinBWs.count(ScalarRoot)) { + auto *MinTy = IntegerType::get(F->getContext(), + MinBWs[ScalarRoot].first); + auto Extend = + MinBWs[ScalarRoot].second ?
Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + int GatherCost = 0; + if (!Entry->NeedToGather) { + for (Value *V : Entry->Scalars) { + ScalarsToVec.insert(V); + if (GatherMap.find(V) != GatherMap.end()) + for (int Gather : GatherMap[V]) + GatherCost += VectorizableTree[Gather].Cost; + } + CostSum += Entry->Cost + GatherCost; } + Entry->ExpandAtCost = CostSum; + + ExtractCost = getExtractCost(ScalarsToVec); + Entry->ExpandAtCost += ExtractCost; + + SpillCost = getSpillCost(ScalarsToVec); + Entry->ExpandAtCost += SpillCost; } - int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + int MinCost = VectorizableTree.back().ExpandAtCost; + if (ReduceTree) { + for (unsigned I = VectorizableTree.size(); I--;) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + if (I > 0 && VectorizableTree[I].ExpandAtCost < MinCost) + MinCost = VectorizableTree[I].ExpandAtCost; + } + } std::string Str; { raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; + OS << "SLP: Total Cost = " << MinCost << ".\n"; } LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); - return Cost; + return MinCost; } int BoUpSLP::getGatherCost(Type *Ty, @@ -2950,7 +3054,7 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { +Value *BoUpSLP::vectorizeTree(Value *Parent, ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) { @@ -3088,7 +3192,7 @@ Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(S.OpValue, Operands); NewPhi->addIncoming(Vec, IBB); } @@ -3183,7 +3287,7 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(S.OpValue, INVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3210,8 +3314,8 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(S.OpValue, LHSV); + Value *R = vectorizeTree(S.OpValue, RHSV); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3244,9 +3348,9 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(S.OpValue, CondVec); + Value *True = vectorizeTree(S.OpValue, TrueVec); + Value *False = vectorizeTree(S.OpValue, FalseVec); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3293,8 +3397,8 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(S.OpValue, LHSVL); + Value *RHS = vectorizeTree(S.OpValue, RHSVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3373,7 +3477,7 @@ setInsertPointAfterBundle(E->Scalars, S); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(S.OpValue, 
ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3404,7 +3508,7 @@ for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(S.OpValue, Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; @@ -3413,7 +3517,7 @@ ValueList OpVL; for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(S.OpValue, OpVL); OpVecs.push_back(OpVec); } @@ -3456,7 +3560,7 @@ OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(S.OpValue, OpVL); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3497,14 +3601,14 @@ if (Instruction::isBinaryOp(S.getOpcode())) { reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + LHS = vectorizeTree(S.OpValue, LHSVL); + RHS = vectorizeTree(S.OpValue, RHSVL); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + LHS = vectorizeTree(S.OpValue, INVL); } if (E->VectorizedValue) { @@ -3574,7 +3678,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all ScheduleData from all nodes for which we have changed the + // vectorization decision. + if (!RemovedOprations.empty()) + removeFromScheduling(BS, RemovedOprations); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -3702,15 +3811,8 @@ Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG - for (User *U : Scalar->users()) { - LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - - // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); - } -#endif + // The tree might not be fully vectorized, so we don't have to + // check every user.
Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); } @@ -4186,6 +4288,33 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS, + ArrayRef Oprations) { + bool Removed = false; + for (int I : Oprations) { + TreeEntry *Entry = &VectorizableTree[I]; + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (Removed) { + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4682,7 +4811,16 @@ const unsigned ChainLen = Chain.size(); LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); - const unsigned Sz = R.getVectorElementSize(Chain[0]); + Value *FirstStore = nullptr; + for (Value *V : Chain) { + assert(isa(V) && "Expected only StoreInst here!"); + if (StoreInst *SI = cast(V)) + if (SI->getValueOperand()) + FirstStore = V; + } + if (!FirstStore) + return false; + const unsigned Sz = R.getVectorElementSize(FirstStore); const unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) @@ -4703,13 +4841,20 @@ << "\n"); ArrayRef Operands = Chain.slice(i, VF); + // Skip if any store instruction is already vectorized. + if (std::any_of(Operands.begin(), Operands.end(), + [](Value *V) { + return (!(cast(V))->getValueOperand()); + })) + continue; + R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + int Cost = R.getTreeCost(true); LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); @@ -4724,6 +4869,7 @@ << " and with tree size " << NV("TreeSize", R.getTreeSize())); + R.reduceTree(); R.vectorizeTree(); // Move to the next bundle.
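Note on the throttling above: getTreeCost(true) records, for every vectorizable entry, the accumulated cost of expanding the tree up to that entry (ExpandAtCost, including extract and spill costs), and reduceTree() then stops vectorization at the entry with the smallest accumulated cost, turning the entries past that point back into gathers. The standalone sketch below (hypothetical types and names, not part of this patch) only illustrates that cut-point scan, under the assumption that per-entry costs have already been accumulated:

// Illustration of the cut-point search (hypothetical names): scan the
// recorded per-entry costs bottom-up and return the index with the minimal
// accumulated cost, mirroring the scan performed in reduceTree().
#include <cstdio>
#include <vector>

struct EntryCost {
  bool NeedToGather;  // gather-only entries are never chosen as a cut point
  int ExpandAtCost;   // accumulated cost if the tree is expanded up to here
};

static unsigned pickStopPoint(const std::vector<EntryCost> &Tree) {
  unsigned StopAt = Tree.size() - 1;      // default: keep the whole tree
  int MinCost = Tree.back().ExpandAtCost;
  for (unsigned I = Tree.size(); I-- > 0;) {
    if (Tree[I].NeedToGather)
      continue;
    if (Tree[I].ExpandAtCost < MinCost) { // cheaper to stop expanding here
      MinCost = Tree[I].ExpandAtCost;
      StopAt = I;
    }
  }
  return StopAt;
}

int main() {
  // Example: the full tree costs +2, but stopping after entry 2 costs -3,
  // so partial vectorization up to entry 2 is the better choice.
  std::vector<EntryCost> Tree = {{false, 4}, {false, 1}, {false, -3},
                                 {true, -3}, {false, 2}};
  std::printf("stop vectorization at entry %u\n", pickStopPoint(Tree));
  return 0;
}

In the patch itself the cut is only applied when it is strictly profitable (StopAt lies between the root and the last entry); the sketch shows only the selection of the minimum.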
Index: test/Transforms/SLPVectorizer/AArch64/matmul.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -13,62 +13,78 @@ ; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[TEMP:%.*]] = load double, double* [[ARRAYIDX1_I]], align 8 ; CHECK-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[TEMP1:%.*]] = load double, double* [[ARRAYIDX3_I]], align 8 +; CHECK-NEXT: [[MUL_I:%.*]] = fmul double [[TEMP]], [[TEMP1]] ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 0, i64 1 ; CHECK-NEXT: [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0 +; CHECK-NEXT: [[TEMP3:%.*]] = load double, double* [[ARRAYIDX7_I]], align 8 +; CHECK-NEXT: [[MUL8_I:%.*]] = fmul double [[TEMP2]], [[TEMP3]] ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TEMP]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TEMP4:%.*]] = load double, double* [[ARRAYIDX13_I]], align 8 +; CHECK-NEXT: [[MUL14_I:%.*]] = fmul double [[TEMP]], [[TEMP4]] ; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> undef, double [[TEMP2]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TEMP5:%.*]] = load double, double* [[ARRAYIDX18_I]], align 8 +; CHECK-NEXT: [[MUL19_I:%.*]] = fmul double [[TEMP2]], [[TEMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[MUL_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL14_I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[MUL8_I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[MUL19_I]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 +; CHECK-NEXT: [[TEMP6:%.*]] = load double, double* [[ARRAYIDX25_I]], align 8 +; CHECK-NEXT: [[MUL26_I:%.*]] = fmul double [[TEMP]], [[TEMP6]] ; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 +; CHECK-NEXT: [[TEMP7:%.*]] = load double, double* [[ARRAYIDX30_I]], align 8 +; CHECK-NEXT: [[MUL31_I:%.*]] = fmul double [[TEMP2]], [[TEMP7]] ; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3 -; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]] +; CHECK-NEXT: [[TEMP8:%.*]] = load double, double* [[ARRAYIDX37_I]], align 8 +; CHECK-NEXT: [[MUL38_I:%.*]] = fmul double [[TEMP]], [[TEMP8]] ; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* -; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TEMP9:%.*]] = load double, double* [[ARRAYIDX42_I]], align 8 +; CHECK-NEXT: [[MUL43_I:%.*]] = fmul double [[TEMP2]], [[TEMP9]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> undef, double [[MUL26_I]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL38_I]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> undef, double [[MUL31_I]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[MUL43_I]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP7]], [[TMP9]] ; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 ; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 +; CHECK-NEXT: [[MUL50_I:%.*]] = fmul double [[TEMP1]], [[TEMP10]] ; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 ; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TEMP10]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> undef, double [[TEMP11]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]] -; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[MUL55_I:%.*]] = fmul double [[TEMP3]], [[TEMP11]] +; CHECK-NEXT: [[MUL62_I:%.*]] = fmul double [[TEMP4]], [[TEMP10]] +; CHECK-NEXT: [[MUL67_I:%.*]] = fmul double [[TEMP5]], [[TEMP11]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> undef, double [[MUL50_I]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[MUL62_I]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> undef, double [[MUL55_I]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[MUL67_I]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[MUL74_I:%.*]] = fmul double [[TEMP6]], [[TEMP10]] +; CHECK-NEXT: [[MUL79_I:%.*]] = fmul double [[TEMP7]], [[TEMP11]] +; CHECK-NEXT: [[MUL86_I:%.*]] = fmul double [[TEMP8]], [[TEMP10]] +; CHECK-NEXT: [[MUL91_I:%.*]] = fmul double [[TEMP9]], [[TEMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x double> undef, 
double [[MUL74_I]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[MUL86_I]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> undef, double [[MUL79_I]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[MUL91_I]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x double> [[TMP17]], [[TMP19]] ; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast double* [[OUT]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP21]], align 8 ; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2 ; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 3 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP22]], align 8 ; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4 ; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 5 -; CHECK-NEXT: [[TMP31:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP15]], <2 x double>* [[TMP23]], align 8 ; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6 ; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 7 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP20]], <2 x double>* [[TMP24]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0 Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -39,7 +39,6 @@ ; CHECK-LABEL: @store_chain_v2i64( ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 ; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 ; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 ; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 @@ -50,8 +49,10 @@ ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] ; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* 
[[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2_1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: test/Transforms/SLPVectorizer/SystemZ/pr34619.ll =================================================================== --- test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -22,8 +22,16 @@ ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], ; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4 ; CHECK-NEXT: unreachable ; entry: Index: test/Transforms/SLPVectorizer/X86/barriercall.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/barriercall.ll +++ test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -8,16 +8,20 @@ ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@bar() -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[N]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[N]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> , [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[N:%.*]], 5 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], 9 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[N]], 9 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[MUL1]], 9 +; CHECK-NEXT: [[MUL4:%.*]] = shl i32 [[N]], 3 +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[MUL4]], 9 +; CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[N]], 10 +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[MUL7]], 9 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ADD]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ADD2]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ADD5]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD8]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_gep.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_gep.ll +++ test/Transforms/SLPVectorizer/X86/crash_gep.ll @@ -14,9 +14,11 @@ ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i64* [[ADD_PTR]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 -; CHECK-NEXT: store i64 [[TMP1]], i64* [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i64* [[ARRAYIDX]] to i64 -; CHECK-NEXT: store i64 [[TMP2]], i64* [[ADD_PTR]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[ADD_PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_smallpt.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -30,16 +30,21 @@ ; CHECK-NEXT: br i1 undef, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]] ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double 0xBFA5CC2D1960285F, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = 
bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[ADD_I264_US:%.*]] = fadd double [[ADD_I276_US]], 0.000000e+00 +; CHECK-NEXT: [[ADD4_I267_US:%.*]] = fadd double undef, 0xBFA5CC2D1960285F +; CHECK-NEXT: [[MUL_I254_US:%.*]] = fmul double [[ADD_I264_US]], 1.400000e+02 +; CHECK-NEXT: [[MUL2_I256_US:%.*]] = fmul double [[ADD4_I267_US]], 1.400000e+02 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[MUL_I254_US]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[MUL2_I256_US]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> , [[TMP1]] +; CHECK-NEXT: [[MUL_I_I_US:%.*]] = fmul double undef, [[ADD_I264_US]] +; CHECK-NEXT: [[MUL2_I_I_US:%.*]] = fmul double undef, [[ADD4_I267_US]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[MUL_I_I_US]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[MUL2_I_I_US]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable Index: test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -10,14 +10,15 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> undef, i64* [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 11 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i64* [[ADD_PTR]] to i64 +; CHECK-NEXT: [[ADD_PTR1:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 56 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i64* [[ADD_PTR1]] to i64 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[ADD_PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/extractcost.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extractcost.ll +++ test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -7,19 +7,22 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) { ; 
CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[N]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[N]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> , [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 -; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP9]], [[M:%.*]] -; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP9]], [[M]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[N:%.*]], 5 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], 9 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[N]], 9 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[MUL1]], 9 +; CHECK-NEXT: [[MUL4:%.*]] = shl i32 [[N]], 3 +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[MUL4]], 9 +; CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[N]], 10 +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[MUL7]], 9 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ADD]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ADD2]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ADD5]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD8]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[ADD]], [[M:%.*]] +; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[ADD]], [[M]] ; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]] ; CHECK-NEXT: ret i32 [[ADD10]] ; Index: test/Transforms/SLPVectorizer/X86/long_chains.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/long_chains.ll +++ test/Transforms/SLPVectorizer/X86/long_chains.ll @@ -12,12 +12,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> undef, i8 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> [[TMP4]], i8 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> , [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> 
[[TMP9]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> , [[TMP10]]
Index: test/Transforms/SLPVectorizer/X86/resched.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/resched.ll
+++ test/Transforms/SLPVectorizer/X86/resched.ll
@@ -12,70 +12,86 @@
 ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SUB_I]] to i8
+; CHECK-NEXT: [[CONV_I_I1199:%.*]] = and i8 [[TMP1]], 1
+; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8
+; CHECK-NEXT: [[CONV_1_I_I:%.*]] = and i8 [[TMP2]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SHR_1_I_I]] to i8
+; CHECK-NEXT: [[CONV_2_I_I:%.*]] = and i8 [[TMP3]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_2_I_I]] to i8
+; CHECK-NEXT: [[CONV_3_I_I:%.*]] = and i8 [[TMP4]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+; CHECK-NEXT: [[SHR_3_I_I:%.*]] = lshr i32 [[CONV31_I]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SHR_3_I_I]] to i8
+; CHECK-NEXT: [[CONV_4_I_I:%.*]] = and i8 [[TMP5]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_4_I_I]] to i8
+; CHECK-NEXT: [[CONV_5_I_I:%.*]] = and i8 [[TMP6]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SHR_5_I_I]] to i8
+; CHECK-NEXT: [[CONV_6_I_I:%.*]] = and i8 [[TMP7]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SHR_6_I_I]] to i8
+; CHECK-NEXT: [[CONV_7_I_I:%.*]] = and i8 [[TMP8]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT: [[SHR_7_I_I:%.*]] = lshr i32 [[CONV31_I]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SHR_7_I_I]] to i8
+; CHECK-NEXT: [[CONV_8_I_I:%.*]] = and i8 [[TMP9]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+; CHECK-NEXT: [[SHR_8_I_I:%.*]] = lshr i32 [[CONV31_I]], 9
+; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SHR_8_I_I]] to i8
+; CHECK-NEXT: [[CONV_9_I_I:%.*]] = and i8 [[TMP10]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+; CHECK-NEXT: [[SHR_9_I_I:%.*]] = lshr i32 [[CONV31_I]], 10
+; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SHR_9_I_I]] to i8
+; CHECK-NEXT: [[CONV_10_I_I:%.*]] = and i8 [[TMP11]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+; CHECK-NEXT: [[SHR_10_I_I:%.*]] = lshr i32 [[CONV31_I]], 11
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SHR_10_I_I]] to i8
+; CHECK-NEXT: [[CONV_11_I_I:%.*]] = and i8 [[TMP12]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT: [[SHR_11_I_I:%.*]] = lshr i32 [[CONV31_I]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SHR_11_I_I]] to i8
+; CHECK-NEXT: [[CONV_12_I_I:%.*]] = and i8 [[TMP13]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SHR_12_I_I]] to i8
+; CHECK-NEXT: [[CONV_13_I_I:%.*]] = and i8 [[TMP14]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
 ; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
+; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SHR_13_I_I]] to i8
+; CHECK-NEXT: [[CONV_14_I_I:%.*]] = and i8 [[TMP15]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
-; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
-; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
-; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
-; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
-; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
-; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[TMP43]]
+; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SHR_14_I_I]] to i8
+; CHECK-NEXT: [[CONV_15_I_I:%.*]] = and i8 [[TMP16]], 1
 ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> undef, i8 [[CONV_I_I1199]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[CONV_1_I_I]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[CONV_2_I_I]], i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[CONV_3_I_I]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[CONV_4_I_I]], i32 4
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[CONV_5_I_I]], i32 5
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[CONV_6_I_I]], i32 6
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[CONV_7_I_I]], i32 7
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[CONV_8_I_I]], i32 8
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[CONV_9_I_I]], i32 9
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[CONV_10_I_I]], i32 10
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[CONV_11_I_I]], i32 11
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[CONV_12_I_I]], i32 12
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[CONV_13_I_I]], i32 13
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[CONV_14_I_I]], i32 14
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[CONV_15_I_I]], i32 15
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP32]], <16 x i8>* [[TMP33]], align 1
 ; CHECK-NEXT: unreachable
 ; CHECK: if.end50.i:
 ; CHECK-NEXT: ret void