diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2341,10 +2341,14 @@
   /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
   /// tree entries.
+  /// \param TE Tree entry checked for permutation.
+  /// \param VL List of scalars (a subset of the TE scalars), checked for
+  /// permutations.
   /// \returns ShuffleKind, if gathered values can be represented as shuffles of
   /// previous tree entries. \p Mask is filled with the shuffle mask.
   std::optional<TargetTransformInfo::ShuffleKind>
-  isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+  isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+                        SmallVectorImpl<int> &Mask,
                         SmallVectorImpl<const TreeEntry *> &Entries);
 
   /// \returns the scalarization cost for this list of values. Assuming that
@@ -6696,13 +6700,36 @@
       return 0;
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
+    SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+    // Build a mask out of the reorder indices and reorder scalars per this
+    // mask.
+    SmallVector<int> ReorderMask;
+    inversePermutation(E->ReorderIndices, ReorderMask);
+    if (!ReorderMask.empty())
+      reorderScalars(GatheredScalars, ReorderMask);
     SmallVector<int> Mask;
+    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
     SmallVector<const TreeEntry *> Entries;
-    std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
-        isGatherShuffledEntry(E, Mask, Entries);
-    if (Shuffle) {
+    // Do not try to look for reshuffled loads for gathered loads (they will be
+    // handled later), for vectorized scalars, and cases, which are definitely
+    // not profitable (splats and small gather nodes.)
+    if (E->getOpcode() != Instruction::Load || E->isAltShuffle() ||
+        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
+        isSplat(E->Scalars) ||
+        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+    if (GatherShuffle) {
+      // Remove shuffled elements from list of gathers.
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        if (Mask[I] != UndefMaskElem)
+          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+      }
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
       InstructionCost GatherCost = 0;
-      if (ShuffleVectorInst::isIdentityMask(Mask)) {
+      int Limit = Mask.size() * 2;
+      if (all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
+          ShuffleVectorInst::isIdentityMask(Mask)) {
         // Perfect match in the graph, will reuse the previously vectorized
         // node. Cost is 0.
         LLVM_DEBUG(
@@ -6721,8 +6748,10 @@
         // previously vectorized nodes. Add the cost of the permutation rather
         // than gather.
         ::addMask(Mask, E->ReuseShuffleIndices);
-        GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
+        GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
       }
+      if (!all_of(GatheredScalars, UndefValue::classof))
+        GatherCost += getGatherCost(GatheredScalars);
       return GatherCost;
     }
     if ((E->getOpcode() == Instruction::ExtractElement ||
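The `GatheredScalars` preprocessing above first inverts `E->ReorderIndices` into a shuffle mask and then scatters the scalars through it. A minimal standalone sketch of that invert-then-scatter pair (plain `std::vector` stand-ins with illustrative names `invert`/`reorder`, modeling rather than calling the LLVM helpers):

    #include <cassert>
    #include <string>
    #include <vector>

    constexpr int UndefElem = -1; // stand-in for UndefMaskElem

    // Invert a permutation: if Indices[J] == I, then Mask[I] = J, so that
    // scattering through Mask realizes the order requested by Indices.
    std::vector<int> invert(const std::vector<unsigned> &Indices) {
      std::vector<int> Mask(Indices.size(), UndefElem);
      for (unsigned J = 0; J < Indices.size(); ++J)
        Mask[Indices[J]] = J;
      return Mask;
    }

    // Scatter semantics of reorderScalars: Result[Mask[I]] = Scalars[I].
    std::vector<std::string> reorder(const std::vector<std::string> &Scalars,
                                     const std::vector<int> &Mask) {
      std::vector<std::string> Res(Scalars.size(), "poison");
      for (unsigned I = 0; I < Mask.size(); ++I)
        if (Mask[I] != UndefElem)
          Res[Mask[I]] = Scalars[I];
      return Res;
    }

    int main() {
      std::vector<int> Mask = invert({2, 0, 1});
      assert((Mask == std::vector<int>{1, 2, 0}));
      auto Res = reorder({"a", "b", "c"}, Mask);
      assert((Res == std::vector<std::string>{"c", "a", "b"}));
    }

The net effect is `Res[J] == Scalars[Indices[J]]`, so an empty `ReorderIndices` lets both the cost path here and the codegen path later skip the reorder entirely.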
@@ -8071,21 +8100,88 @@
 }
 
 std::optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+                               SmallVectorImpl<int> &Mask,
                                SmallVectorImpl<const TreeEntry *> &Entries) {
+  Entries.clear();
+  // No need to check for the topmost gather node.
+  if (TE == VectorizableTree.front().get())
+    return std::nullopt;
+  Mask.assign(VL.size(), UndefMaskElem);
+  assert(TE->UserTreeIndices.size() == 1 &&
+         "Expected only single user of the gather node.");
   // TODO: currently checking only for Scalars in the tree entry, need to count
   // reused elements too for better cost estimation.
-  Mask.assign(TE->Scalars.size(), UndefMaskElem);
-  Entries.clear();
+  Instruction &UserInst =
+      getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
+  auto *PHI = dyn_cast<PHINode>(&UserInst);
+  auto *NodeUI = DT->getNode(
+      PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx)
+          : UserInst.getParent());
+  assert(NodeUI && "Should only process reachable instructions");
+  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
+  auto CheckOrdering = [&](Instruction *LastEI) {
+    // Check if the user node of the TE comes after user node of EntryPtr,
+    // otherwise EntryPtr depends on TE.
+    // Gather nodes usually are not scheduled and inserted before their first
+    // user node. So, instead of checking dependency between the gather nodes
+    // themselves, we check the dependency between their user nodes.
+    // If one user node comes before the second one, we cannot use the second
+    // gather node as the source vector for the first gather node, because in
+    // the list of instructions it will be emitted later.
+    auto *EntryParent = LastEI->getParent();
+    auto *NodeEUI = DT->getNode(EntryParent);
+    if (!NodeEUI)
+      return false;
+    assert((NodeUI == NodeEUI) ==
+               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
+           "Different nodes should have different DFS numbers");
+    // Check the order of the gather nodes users.
+    if (UserInst.getParent() != EntryParent &&
+        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
+      return false;
+    if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI))
+      return false;
+    return true;
+  };
   // Build a lists of values to tree entries.
   DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
   for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
     if (EntryPtr.get() == TE)
-      break;
+      continue;
     if (EntryPtr->State != TreeEntry::NeedToGather)
       continue;
+    if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) {
+          return GatheredScalars.contains(V);
+        }))
+      continue;
+    assert(EntryPtr->UserTreeIndices.size() == 1 &&
+           "Expected only single user of the gather node.");
+    Instruction &EntryUserInst =
+        getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE);
+    if (&UserInst == &EntryUserInst) {
+      // If 2 gathers are operands of the same entry, compare operands
+      // indices, use the earlier one as the base.
+      if (TE->UserTreeIndices.front().UserTE ==
+              EntryPtr->UserTreeIndices.front().UserTE &&
+          TE->UserTreeIndices.front().EdgeIdx <
+              EntryPtr->UserTreeIndices.front().EdgeIdx)
+        continue;
+    }
+    // Check if the user node of the TE comes after user node of EntryPtr,
+    // otherwise EntryPtr depends on TE.
+    auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst);
+    auto *EntryI =
+        EntryPHI
+            ? EntryPHI
+                  ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx)
+                  ->getTerminator()
+            : &EntryUserInst;
+    if (!CheckOrdering(EntryI))
+      continue;
     for (Value *V : EntryPtr->Scalars)
-      ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
+      if (!isConstant(V))
+        ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
   }
   // Find all tree entries used by the gathered values. If no common entries
   // found - not a shuffle.
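`CheckOrdering` compares the *users* of the two gather nodes rather than the gathers themselves, since gather nodes are not scheduled and are emitted right before their single user. A toy model of the same-block half of that rule (hypothetical `Inst`/`canReuseSameBlock` names; the cross-block half consults the dominator tree and is omitted here):

    #include <cassert>

    // Toy stand-in: an instruction is (basic block id, position within block).
    struct Inst {
      int Block;
      int Pos;
    };

    // Same-block half of CheckOrdering: the candidate gather's user (LastEI)
    // must not be emitted after our own user (UserInst), or its vector would
    // not exist yet at the point where we want to shuffle from it.
    // Mirrors: if (UserInst.comesBefore(LastEI)) return false;
    bool canReuseSameBlock(const Inst &UserInst, const Inst &LastEI) {
      assert(UserInst.Block == LastEI.Block && "same-block case only");
      return !(UserInst.Pos < LastEI.Pos);
    }

    int main() {
      Inst Ours{0, 5}, Earlier{0, 3}, Later{0, 7};
      assert(canReuseSameBlock(Ours, Earlier)); // emitted before us: reusable
      assert(!canReuseSameBlock(Ours, Later));  // emitted after us: not reusable
    }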
@@ -8097,7 +8193,7 @@
   SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
   DenseMap<Value *, int> UsedValuesEntry;
   for (Value *V : TE->Scalars) {
-    if (isa<UndefValue>(V))
+    if (isConstant(V))
       continue;
     // Build a list of tree entries where V is used.
     SmallPtrSet<const TreeEntry *, 4> VToTEs;
@@ -8107,10 +8203,11 @@
     if (const TreeEntry *VTE = getTreeEntry(V))
       VToTEs.insert(VTE);
     if (VToTEs.empty())
-      return std::nullopt;
+      continue;
     if (UsedTEs.empty()) {
       // The first iteration, just insert the list of nodes to vector.
       UsedTEs.push_back(VToTEs);
+      UsedValuesEntry.try_emplace(V, 0);
     } else {
       // Need to check if there are any previously used tree nodes which use V.
       // If there are no such nodes, consider that we have another one input
@@ -8135,8 +8232,9 @@
       if (Idx == UsedTEs.size()) {
         // If the number of input vectors is greater than 2 - not a permutation,
         // fallback to the regular gather.
+        // TODO: support multiple reshuffled nodes.
         if (UsedTEs.size() == 2)
-          return std::nullopt;
+          continue;
         UsedTEs.push_back(SavedVToTEs);
         Idx = UsedTEs.size() - 1;
       }
@@ -8144,32 +8242,55 @@
     }
   }
 
-  if (UsedTEs.empty()) {
-    assert(all_of(TE->Scalars, UndefValue::classof) &&
-           "Expected vector of undefs only.");
+  if (UsedTEs.empty())
     return std::nullopt;
-  }
 
   unsigned VF = 0;
   if (UsedTEs.size() == 1) {
+    // Keep the order to avoid non-determinism.
+    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
+                                                UsedTEs.front().end());
+    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+      return TE1->Idx < TE2->Idx;
+    });
     // Try to find the perfect match in another gather node at first.
-    auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
-      return EntryPtr->isSame(TE->Scalars);
+    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
+      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
     });
-    if (It != UsedTEs.front().end()) {
+    if (It != FirstEntries.end()) {
       Entries.push_back(*It);
       std::iota(Mask.begin(), Mask.end(), 0);
+      // Clear undef scalars.
+      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+        if (isa<UndefValue>(TE->Scalars[I]))
+          Mask[I] = UndefMaskElem;
       return TargetTransformInfo::SK_PermuteSingleSrc;
     }
-    // No perfect match, just shuffle, so choose the first tree node.
-    Entries.push_back(*UsedTEs.front().begin());
+    // No perfect match, just shuffle, so choose the first tree node from the
+    // tree.
+    Entries.push_back(FirstEntries.front());
   } else {
     // Try to find nodes with the same vector factor.
     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
+    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
-    for (const TreeEntry *TE : UsedTEs.front())
-      VFToTE.try_emplace(TE->getVectorFactor(), TE);
-    for (const TreeEntry *TE : UsedTEs.back()) {
+    for (const TreeEntry *TE : UsedTEs.front()) {
+      unsigned VF = TE->getVectorFactor();
+      auto It = VFToTE.find(VF);
+      if (It != VFToTE.end()) {
+        if (It->second->Idx > TE->Idx)
+          It->getSecond() = TE;
+        continue;
+      }
+      VFToTE.try_emplace(VF, TE);
+    }
+    // Same, keep the order to avoid non-determinism.
+    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
+                                                 UsedTEs.back().end());
+    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+      return TE1->Idx < TE2->Idx;
+    });
+    for (const TreeEntry *TE : SecondEntries) {
       auto It = VFToTE.find(TE->getVectorFactor());
       if (It != VFToTE.end()) {
         VF = It->first;
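The loops above bucket every gathered scalar into at most two candidate source entries; the behavioral change is that a scalar needing a third source (or found in no entry at all) is now skipped with `continue` instead of failing the whole query with `return std::nullopt`. A self-contained model of that bucketing, simplified in that the real code also narrows each stored set to the running intersection via `set_intersect` (illustrative names, std containers standing in for the LLVM ones):

    #include <algorithm>
    #include <cassert>
    #include <set>
    #include <vector>

    using EntrySet = std::set<int>; // entry indices stand in for TreeEntry *

    bool intersects(const EntrySet &A, const EntrySet &B) {
      return std::any_of(A.begin(), A.end(), [&](int E) { return B.count(E); });
    }

    // Per scalar: the set of entries containing it. Result per scalar: 0 or 1
    // for the chosen source, -1 for "leave it to the scalar gather".
    std::vector<int> assignSources(const std::vector<EntrySet> &PerScalar) {
      std::vector<EntrySet> Used;
      std::vector<int> Res;
      for (const EntrySet &S : PerScalar) {
        if (S.empty()) { // scalar not found in any earlier entry
          Res.push_back(-1);
          continue;
        }
        unsigned Idx = 0;
        while (Idx < Used.size() && !intersects(Used[Idx], S))
          ++Idx;
        if (Idx == Used.size()) {
          if (Used.size() == 2) { // would need a 3rd source: gather it
            Res.push_back(-1);
            continue;
          }
          Used.push_back(S);
        }
        Res.push_back(Idx);
      }
      return Res;
    }

    int main() {
      // Scalars found in entries {1}, {1,2}, {3}, {4}: two sources get chosen
      // and the scalar needing a third falls back to a plain gather.
      auto R = assignSources({{1}, {1, 2}, {3}, {4}});
      assert((R == std::vector<int>{0, 0, 1, -1}));
    }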
@@ -8184,28 +8305,122 @@
       return std::nullopt;
   }
 
+  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof);
+  // Checks if the 2 PHIs are compatible in terms of high possibility to be
+  // vectorized.
+  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
+    auto *PHI = cast<PHINode>(V);
+    auto *PHI1 = cast<PHINode>(V1);
+    // Check that all incoming values are compatible/from same parent (if they
+    // are instructions).
+    // The incoming values are compatible if they all are constants, or
+    // instruction with the same/alternate opcodes from the same basic block.
+    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+      Value *In = PHI->getIncomingValue(I);
+      Value *In1 = PHI1->getIncomingValue(I);
+      if (isConstant(In) && isConstant(In1))
+        continue;
+      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
+        return false;
+      if (cast<Instruction>(In)->getParent() !=
+          cast<Instruction>(In1)->getParent())
+        return false;
+    }
+    return true;
+  };
+  // Check if the value can be ignored during analysis for shuffled gathers.
+  // We suppose it is better to ignore instructions, which do not form splats,
+  // are not vectorized/not extractelements (these instructions will be
+  // handled by extractelements processing) or may form vector node in future.
+  auto MightBeIgnored = [=](Value *V) {
+    auto *I = dyn_cast<Instruction>(V);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
+           !isVectorLikeInstWithConstOps(I) &&
+           !areAllUsersVectorized(I, IgnoredVals) && isSimple(I);
+  };
+  // Check that the neighbor instruction may form a full vector node with the
+  // current instruction V. It is possible, if they have same/alternate opcode
+  // and same parent basic block.
+  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
+    Value *V1 = VL[Idx];
+    bool UsedInSameVTE = false;
+    auto It = UsedValuesEntry.find(V1);
+    if (It != UsedValuesEntry.end())
+      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
+    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
+           getSameOpcode({V, V1}, *TLI).getOpcode() &&
+           cast<Instruction>(V)->getParent() ==
+               cast<Instruction>(V1)->getParent() &&
+           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
+  };
   // Build a shuffle mask for better cost estimation and vector emission.
-  for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
-    Value *V = TE->Scalars[I];
-    if (isa<UndefValue>(V))
+  SmallBitVector UsedIdxs(Entries.size());
+  SmallVector<std::pair<unsigned, int>> EntryLanes;
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    Value *V = VL[I];
+    auto It = UsedValuesEntry.find(V);
+    if (It == UsedValuesEntry.end())
      continue;
-    unsigned Idx = UsedValuesEntry.lookup(V);
-    const TreeEntry *VTE = Entries[Idx];
-    int FoundLane = VTE->findLaneForValue(V);
-    Mask[I] = Idx * VF + FoundLane;
-    // Extra check required by isSingleSourceMaskImpl function (called by
-    // ShuffleVectorInst::isSingleSourceMask).
-    if (Mask[I] >= 2 * E)
-      return std::nullopt;
+    // Do not try to shuffle scalars, if they are constants, or instructions
+    // that can be vectorized as a result of the following vector build
+    // vectorization.
+    if (isConstant(V) || (MightBeIgnored(V) &&
+                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
+                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
+      continue;
+    unsigned Idx = It->second;
+    EntryLanes.emplace_back(Idx, I);
+    UsedIdxs.set(Idx);
+  }
+  // Iterate through all shuffled scalars and select entries, which can be used
+  // for final shuffle.
+  SmallVector<const TreeEntry *> TempEntries;
+  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
+    if (!UsedIdxs.test(I))
+      continue;
+    // Fix the entry number for the given scalar. If it is the first entry, set
+    // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
+    // These indices are used when calculating final shuffle mask as the vector
+    // offset.
+    for (std::pair<unsigned, int> &Pair : EntryLanes)
+      if (Pair.first == I)
+        Pair.first = TempEntries.size();
+    TempEntries.push_back(Entries[I]);
+  }
+  Entries.swap(TempEntries);
+  if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
+    // We may have here 1 or 2 entries only. If the number of scalars is equal
+    // to the number of entries, no need to do the analysis, it is not very
+    // profitable. Since VL is not the same as TE->Scalars, it means we already
+    // have some shuffles before. Cut off not profitable case.
+    Entries.clear();
+    return std::nullopt;
+  }
+  // Build the final mask, check for the identity shuffle, if possible.
+  bool IsIdentity = Entries.size() == 1;
+  // Pair.first is the offset to the vector, while Pair.second is the index of
+  // scalar in the list.
+  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
+    Mask[Pair.second] = Pair.first * VF +
+                        Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+    IsIdentity &= Mask[Pair.second] == Pair.second;
   }
   switch (Entries.size()) {
   case 1:
-    return TargetTransformInfo::SK_PermuteSingleSrc;
+    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+      return TargetTransformInfo::SK_PermuteSingleSrc;
+    break;
   case 2:
-    return TargetTransformInfo::SK_PermuteTwoSrc;
+    if (EntryLanes.size() > 2 || VL.size() <= 2)
+      return TargetTransformInfo::SK_PermuteTwoSrc;
+    break;
   default:
     break;
   }
+  Entries.clear();
   return std::nullopt;
 }
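The `EntryLanes` pairs collected in the hunk above become the actual shuffle mask: each pair contributes `Mask[lane] = source * VF + findLaneForValue(scalar)`, and only a single-source mask that keeps every element in place qualifies as the free identity match. A compilable toy version (hypothetical `buildMask`, ints standing in for `Value *` scalars):

    #include <algorithm>
    #include <cassert>
    #include <utility>
    #include <vector>

    constexpr int UndefElem = -1; // stand-in for UndefMaskElem

    // Toy version of the final mask construction: each (source, lane) pair
    // contributes Mask[lane] = source * VF + position-in-source; the result
    // is an identity only for a single source with every element in place.
    bool buildMask(const std::vector<std::pair<unsigned, int>> &EntryLanes,
                   const std::vector<std::vector<int>> &Sources, unsigned VF,
                   const std::vector<int> &VL, std::vector<int> &Mask) {
      Mask.assign(VL.size(), UndefElem);
      bool IsIdentity = Sources.size() == 1;
      for (const auto &[Src, Lane] : EntryLanes) {
        const std::vector<int> &S = Sources[Src];
        // Models TreeEntry::findLaneForValue: where the scalar lives in the
        // chosen source entry.
        int Found = std::find(S.begin(), S.end(), VL[Lane]) - S.begin();
        Mask[Lane] = Src * VF + Found;
        IsIdentity &= Mask[Lane] == Lane;
      }
      return IsIdentity;
    }

    int main() {
      // Two sources of VF=2; scalars 10 and 20 live in source 0, 99 in 1.
      std::vector<int> Mask;
      bool Id = buildMask({{0, 0}, {0, 1}, {1, 2}}, {{10, 20}, {30, 99}}, 2,
                          {10, 20, 99}, Mask);
      assert((Mask == std::vector<int>{0, 1, 3})); // 3 == 1 * VF + lane 1
      assert(!Id); // two sources can never be the identity case
    }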
@@ -8966,11 +9181,18 @@
   }
   if (E->getMainOp())
     setInsertPointAfterBundle(E);
+  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+  // Build a mask out of the reorder indices and reorder scalars per this
+  // mask.
+  SmallVector<int> ReorderMask;
+  inversePermutation(E->ReorderIndices, ReorderMask);
+  if (!ReorderMask.empty())
+    reorderScalars(GatheredScalars, ReorderMask);
   Value *Vec;
   SmallVector<int> Mask;
   SmallVector<const TreeEntry *> Entries;
   std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
-      isGatherShuffledEntry(E, Mask, Entries);
+      isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
   if (Shuffle) {
     assert((Entries.size() == 1 || Entries.size() == 2) &&
            "Expected shuffle of 1 or 2 entries.");
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-threshold=-2 -S -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK
-; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 -slp-min-tree-size=5 | FileCheck %s --check-prefix=FORCE_REDUCTION
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 -slp-min-tree-size=5 | FileCheck %s --check-prefix=FORCE_REDUCTION
 
 define void @Test(i32) {
 ; CHECK-LABEL: @Test(
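In the crash_netbsd_decompress.ll diff below, the `sw.bb` value is now built as a two-source shuffle of a small constant vector with the reused load `[[TMP0]]` (the `<i32 poison, i32 0>` and `<i32 2, i32 1>` operands in that CHECK line are reconstructed from the scalar PHIs the test replaces, so treat the exact constants as inferred). A quick standalone model of shufflevector lane selection for that mask:

    #include <cassert>
    #include <string>
    #include <vector>

    // shufflevector lane selection: mask element M picks A[M] when
    // M < A.size(), otherwise B[M - A.size()].
    std::vector<std::string> shuffle(const std::vector<std::string> &A,
                                     const std::vector<std::string> &B,
                                     const std::vector<int> &Mask) {
      std::vector<std::string> R;
      for (int M : Mask)
        R.push_back(M < (int)A.size() ? A[M] : B[M - A.size()]);
      return R;
    }

    int main() {
      // <i32 poison, i32 0> shuffled with %tmp0 = <t, f> by mask <i32 2,
      // i32 1>: lane 0 takes %tmp0's first element, lane 1 the constant 0.
      auto R = shuffle({"poison", "0"}, {"t", "f"}, {2, 1});
      assert((R == std::vector<std::string>{"t", "0"}));
    }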
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -15,15 +15,15 @@
 define i32 @fn1() {
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @b, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_DSTATE:%.*]], ptr @b, i32 0, i32 1), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @d, align 4
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr @b, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[SW_BB:%.*]], label [[SAVE_STATE_AND_RETURN:%.*]]
 ; CHECK:       sw.bb:
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @c, align 4
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP3]], 7
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @c, align 4
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP2]], 7
 ; CHECK-NEXT:    store i32 [[AND]], ptr @a, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> <i32 poison, i32 0>, <2 x i32> [[TMP0]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    switch i32 [[AND]], label [[IF_END:%.*]] [
 ; CHECK-NEXT:    i32 7, label [[SAVE_STATE_AND_RETURN]]
 ; CHECK-NEXT:    i32 0, label [[SAVE_STATE_AND_RETURN]]
@@ -31,10 +31,8 @@
 ; CHECK:       if.end:
 ; CHECK-NEXT:    br label [[SAVE_STATE_AND_RETURN]]
 ; CHECK:       save_state_and_return:
-; CHECK-NEXT:    [[T_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP0]], [[SW_BB]] ], [ [[TMP0]], [[SW_BB]] ]
-; CHECK-NEXT:    [[F_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP1]], [[ENTRY]] ], [ 0, [[SW_BB]] ], [ 0, [[SW_BB]] ]
-; CHECK-NEXT:    store i32 [[T_0]], ptr @b, align 4
-; CHECK-NEXT:    store i32 [[F_0]], ptr getelementptr inbounds ([[STRUCT_DSTATE]], ptr @b, i32 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[IF_END]] ], [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP3]], [[SW_BB]] ], [ [[TMP3]], [[SW_BB]] ]
+; CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr @b, align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry: