Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1170,6 +1170,13 @@ SmallVectorImpl &Right, const DataLayout &DL, ScalarEvolution &SE); + + /// Kind of the tree entry. + enum TreeEntryKind { + TEK_Vectorize, /// Entry must be vectorized. + TEK_Gather, /// Entry is the set of gather scalars. + TEK_EEShuffle, /// Entry is the shuffle of the ExtractElementInstructions. + }; struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1190,8 +1197,9 @@ /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; - /// Do we need to gather this sequence ? - bool NeedToGather = false; + /// What should be done with the tree entry: vectorization, gathering or + /// shuffling of ExtractElement|InsertElements. + TreeEntryKind VectorizationKind = TEK_Vectorize; /// Does this sequence require some shuffling? SmallVector ReuseShuffleIndices; @@ -1333,7 +1341,19 @@ dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "VectorizationKind: "; + switch (VectorizationKind) { + case TEK_Vectorize: + dbgs() << "vectorize"; + break; + case TEK_EEShuffle: + dbgs() << "extract element shuffle"; + break; + case TEK_Gather: + dbgs() << "gather"; + break; + } + dbgs() << "\n"; dbgs() << "MainOp: " << *MainOp << "\n"; dbgs() << "AltOp: " << *AltOp << "\n"; dbgs() << "VectorizedValue: "; @@ -1366,18 +1386,20 @@ const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, - ArrayRef ReorderIndices = None) { - bool Vectorized = (bool)Bundle; + ArrayRef ReorderIndices = None, + TreeEntryKind Kind = TEK_Gather) { + assert((Bundle.hasValue() || Kind != TEK_Vectorize) && + "non-gather node without scheduling data."); VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->VectorizationKind = Kind; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; Last->setOperations(S); - if (Vectorized) { + if (Kind == TEK_Vectorize) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = Last; @@ -1392,7 +1414,7 @@ } assert((!Bundle.getValue() || Lane == VL.size()) && "Bundle and VL out of sync"); - } else { + } else if (Kind == TEK_Gather) { MustGather.insert(VL.begin(), VL.end()); } @@ -2044,7 +2066,7 @@ static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->VectorizationKind != BoUpSLP::TEK_Vectorize) return "color=red"; return ""; } @@ -2072,7 +2094,7 @@ TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->VectorizationKind != TEK_Vectorize) continue; // For each lane: @@ -2109,7 +2131,7 @@ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->VectorizationKind == TEK_Vectorize && "Bad state"); continue; } } @@ -2288,8 +2310,8 @@ } } - TreeEntry *TE = - newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); // Keeps the reordered operands to avoid code duplication. @@ -2315,7 +2337,7 @@ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2337,14 +2359,22 @@ NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, - StoredCurrentOrderAndNum->getFirst()); + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst(), + TEK_Vectorize); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); VectorizableTree.back()->setOperand(0, Op0); return; + } else if (ShuffleOrOp == Instruction::ExtractElement && + isShuffle(VL).hasValue()) { + ++NumOpsWantToKeepOriginalOrder; + newTreeEntry(VL, None/*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, None, TEK_EEShuffle); + BS.cancelScheduling(VL, VL0); + LLVM_DEBUG(dbgs() << "SLP: extractelement is shuffle.\n"); + return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -2409,17 +2439,18 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, None, TEK_Vectorize); TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies, + I->getFirst(), TEK_Vectorize); TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -2458,7 +2489,7 @@ } } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); TE->setOperandsInOrder(); @@ -2492,7 +2523,7 @@ } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2540,7 +2571,7 @@ case Instruction::Or: case Instruction::Xor: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -2607,7 +2638,7 @@ } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { @@ -2632,7 +2663,7 @@ } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -2704,7 +2735,7 @@ } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; @@ -2728,7 +2759,7 @@ return; } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndicies, None, TEK_Vectorize); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -2884,37 +2915,20 @@ ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->VectorizationKind == TEK_Gather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (E->getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); - if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - IO->getZExtValue()); - } - } - return ReuseShuffleCost + Cost; - } - } return ReuseShuffleCost + getGatherCost(VL); } assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert((E->VectorizationKind == TEK_Vectorize || + (E->VectorizationKind == TEK_EEShuffle && + E->getOpcode() == Instruction::ExtractElement)) && + "Must be vectorization entry or ExtractElementInst shuffle."); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -2952,7 +2966,26 @@ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->VectorizationKind == TEK_EEShuffle) { + Optional ShuffleKind = isShuffle(VL); + assert(ShuffleKind.hasValue() && "Must be shuffle of extractelements"); + int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + for (auto *V : VL) { + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + if (areAllUsersVectorized(cast(V)) && + !ScalarToTreeEntry.count(V)) { + auto *IO = cast( + cast(V)->getIndexOperand()); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + IO->getZExtValue()); + } + } + return ReuseShuffleCost + Cost; + } + if (E->VectorizationKind == TEK_Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. @@ -3231,20 +3264,22 @@ << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->VectorizationKind == TEK_Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->VectorizationKind == TEK_Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->VectorizationKind != TEK_Vectorize || + VectorizableTree[1]->VectorizationKind != TEK_Vectorize) return false; return true; @@ -3361,12 +3396,13 @@ // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.VectorizationKind == TEK_Gather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->VectorizationKind == TEK_Gather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3654,7 +3690,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->VectorizationKind == TEK_Gather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -3668,6 +3704,10 @@ E->VectorizedValue = V; return V; } + assert((E->VectorizationKind == TEK_Vectorize || + (E->VectorizationKind == TEK_EEShuffle && + E->getOpcode() == Instruction::ExtractElement)) && + "Must be vectorization entry or ExtractElementInst shuffle."); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -3709,7 +3749,7 @@ } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->VectorizationKind == TEK_Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3728,53 +3768,46 @@ E->VectorizedValue = V; return V; } + assert(E->VectorizationKind == TEK_EEShuffle && + "Only shuffle is expected."); setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); + if (E->VectorizationKind != TEK_Vectorize) { + if (auto *I = dyn_cast(V)) { + GatherSeq.insert(I); + CSEBlocks.insert(I->getParent()); + } } } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { - LoadInst *LI = cast(E->getSingleOperand(0)); - Builder.SetInsertPoint(LI); - PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); - Value *NewV = propagateMetadata(V, E->Scalars); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector( - NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = NewV; - return NewV; + assert(E->VectorizationKind == TEK_Vectorize && + "Unable to reuse extractvalues"); + LoadInst *LI = cast(E->getSingleOperand(0)); + Builder.SetInsertPoint(LI); + PointerType *PtrTy = + PointerType::get(VecTy, LI->getPointerAddressSpace()); + Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); + Value *NewV = propagateMetadata(V, E->Scalars); + if (!E->ReorderIndices.empty()) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } + // TODO: Merge this shuffle with the ReorderShuffleMask. + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); } - E->VectorizedValue = V; - return V; + E->VectorizedValue = NewV; + return NewV; } case Instruction::ZExt: case Instruction::SExt: @@ -4204,7 +4237,8 @@ continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->VectorizationKind == TEK_Vectorize && + "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4277,7 +4311,7 @@ TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->VectorizationKind != TEK_Vectorize) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value");