diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2477,12 +2477,13 @@ /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. - /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. - std::optional - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries); + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector> isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -6733,13 +6734,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { bool IsFinalized = false; SmallVector CommonMask; - SmallVector , 2> InVectors; + SmallVector, 2> InVectors; const TargetTransformInfo &TTI; InstructionCost Cost = 0; ArrayRef VectorizedVals; BoUpSLP &R; SmallPtrSetImpl &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + bool SameNodesEstimated = true; InstructionCost getBuildVectorCost(ArrayRef VL, Value *Root) { if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof)) @@ -6928,6 +6930,47 @@ } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. + static void transformMaskAfterShuffle(MutableArrayRef CommonMask, + ArrayRef Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay cost estimation of the same nodes are reshuffling. + if ((InVectors.size() == 2 && + InVectors.front().get() == &E1 && + InVectors.back().get() == E2) || + (!E2 && InVectors.size() == 1 && + InVectors.front().get() == &E1)) { + ArrayRef SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + for (unsigned I = 0; I < Part; ++I) { + // Ignore empty (all poisons) submasks. + ArrayRef SubMask = + ArrayRef(CommonMask).slice(I * SliceSize, SliceSize); + if (all_of(SubMask, [](int Idx) { return Idx == PoisonMaskElem; })) + continue; + Cost += createShuffle( + InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), SubMask); + } + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; @@ -7111,22 +7154,65 @@ // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. Cost += computeExtractCost(VL, Mask, ShuffleKind); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); } - void add(const TreeEntry *E1, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, ArrayRef Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); } /// Adds another one input vector and the mask for the shuffling. void add(Value *V1, ArrayRef Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + if (InVectors.empty()) { + assert(CommonMask.empty() && "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + assert(InVectors.size() == 1 && InVectors.front().is() && + !CommonMask.empty() && "Expected only single entry from extracts."); + InVectors.push_back(V1); + unsigned VF = CommonMask.size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } Value *gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); @@ -7235,8 +7321,8 @@ SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffle; + SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); // Check for gathered extracts. ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); @@ -7245,14 +7331,24 @@ IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; + unsigned NumParts = TTI->getNumberOfParts(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size())); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } + } else if (auto *VecTy = + FixedVectorType::get(VL.front()->getType(), VL.size()); + ExtractShuffle && + TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { + copy(VL, GatheredScalars.begin()); + } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely @@ -7262,24 +7358,28 @@ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + GatherShuffle = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + if (!GatherShuffle.empty()) { + if (GatherShuffle.size() == 1 && + *GatherShuffle.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( dbgs() << "SLP: perfect diamond match for gather bundle that starts with " << *VL.front() << ".\n"); + (void)Estimator.finalize(std::nullopt); return 0; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -7291,10 +7391,24 @@ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle that starts with " << *VL.front() << ".\n";); - if (Entries.size() == 1) - Estimator.add(Entries.front(), Mask); - else - Estimator.add(Entries.front(), Entries.back(), Mask); + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffle[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) + Estimator.add(*TEs.front(), VecMask); + else + Estimator.add(*TEs.front(), *TEs.back(), VecMask); + } if (all_of(GatheredScalars, PoisonValue ::classof)) return Estimator.finalize(E->ReuseShuffleIndices); return Estimator.finalize( @@ -7318,8 +7432,6 @@ std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; @@ -8583,14 +8695,16 @@ return Cost; } -std::optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries) { +SmallVector> BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expectedpoistive number of registers."); Entries.clear(); // No need to check for the topmost gather node. if (TE == VectorizableTree.front().get()) - return std::nullopt; + return {}; Mask.assign(VL.size(), PoisonMaskElem); assert(TE->UserTreeIndices.size() == 1 && "Expected only single user of the gather node."); @@ -8609,7 +8723,6 @@ } auto *NodeUI = DT->getNode(ParentBB); assert(NodeUI && "Should only process reachable instructions"); - SmallPtrSet GatheredScalars(VL.begin(), VL.end()); auto CheckOrdering = [&](Instruction *LastEI) { // Check if the user node of the TE comes after user node of EntryPtr, // otherwise EntryPtr depends on TE. @@ -8634,293 +8747,321 @@ return false; return true; }; - // Find all tree entries used by the gathered values. If no common entries - // found - not a shuffle. - // Here we build a set of tree nodes for each gathered value and trying to - // find the intersection between these sets. If we have at least one common - // tree node for each gathered value - we have just a permutation of the - // single vector. If we have 2 different sets, we're in situation where we - // have a permutation of 2 input vectors. - SmallVector> UsedTEs; - DenseMap UsedValuesEntry; - for (Value *V : VL) { - if (isConstant(V)) - continue; - // Build a list of tree entries where V is used. - SmallPtrSet VToTEs; - for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) { - if (TEPtr == TE) + unsigned SliceSize = VL.size() / NumParts; + SmallVector> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallPtrSet GatheredScalars(SubVL.begin(), SubVL.end()); + // Find all tree entries used by the gathered values. If no common entries + // found - not a shuffle. + // Here we build a set of tree nodes for each gathered value and trying to + // find the intersection between these sets. If we have at least one common + // tree node for each gathered value - we have just a permutation of the + // single vector. If we have 2 different sets, we're in situation where we + // have a permutation of 2 input vectors. + SmallVector> UsedTEs; + DenseMap UsedValuesEntry; + for (Value *V : SubVL) { + if (isConstant(V)) continue; - if (!any_of(TEPtr->Scalars, [&GatheredScalars](Value *V) { - return GatheredScalars.contains(V); - })) - continue; - assert(TEPtr->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); - Instruction &EntryUserInst = - getLastInstructionInBundle(TEPtr->UserTreeIndices.front().UserTE); - PHINode *EntryPHI = - dyn_cast(TEPtr->UserTreeIndices.front().UserTE->getMainOp()); - if (&UserInst == &EntryUserInst && !EntryPHI) { - // If 2 gathers are operands of the same entry, compare operands - // indices, use the earlier one as the base. - if (TE->UserTreeIndices.front().UserTE == - TEPtr->UserTreeIndices.front().UserTE && - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx) + // Build a list of tree entries where V is used. + SmallPtrSet VToTEs; + for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) { + if (TEPtr == TE) continue; + if (!any_of(TEPtr->Scalars, [&GatheredScalars](Value *V) { + return GatheredScalars.contains(V); + })) + continue; + assert(TEPtr->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + Instruction &EntryUserInst = + getLastInstructionInBundle(TEPtr->UserTreeIndices.front().UserTE); + PHINode *EntryPHI = dyn_cast( + TEPtr->UserTreeIndices.front().UserTE->getMainOp()); + if (&UserInst == &EntryUserInst && !EntryPHI) { + // If 2 gathers are operands of the same entry, compare operands + // indices, use the earlier one as the base. + if (TE->UserTreeIndices.front().UserTE == + TEPtr->UserTreeIndices.front().UserTE && + TE->UserTreeIndices.front().EdgeIdx < + TEPtr->UserTreeIndices.front().EdgeIdx) + continue; + } + // Check if the user node of the TE comes after user node of EntryPtr, + // otherwise EntryPtr depends on TE. + auto *EntryI = + EntryPHI + ? EntryPHI + ->getIncomingBlock(TEPtr->UserTreeIndices.front().EdgeIdx) + ->getTerminator() + : &EntryUserInst; + if (!CheckOrdering(EntryI) && + (ParentBB != EntryI->getParent() || + TE->UserTreeIndices.front().UserTE != + TEPtr->UserTreeIndices.front().UserTE || + TE->UserTreeIndices.front().EdgeIdx < + TEPtr->UserTreeIndices.front().EdgeIdx)) + continue; + VToTEs.insert(TEPtr); } - // Check if the user node of the TE comes after user node of EntryPtr, - // otherwise EntryPtr depends on TE. - auto *EntryI = EntryPHI - ? EntryPHI - ->getIncomingBlock( - TEPtr->UserTreeIndices.front().EdgeIdx) - ->getTerminator() - : &EntryUserInst; - if (!CheckOrdering(EntryI) && - (ParentBB != EntryI->getParent() || - TE->UserTreeIndices.front().UserTE != - TEPtr->UserTreeIndices.front().UserTE || - TE->UserTreeIndices.front().EdgeIdx < - TEPtr->UserTreeIndices.front().EdgeIdx)) - continue; - VToTEs.insert(TEPtr); - } - if (const TreeEntry *VTE = getTreeEntry(V)) { - Instruction &EntryUserInst = getLastInstructionInBundle(VTE); - if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst)) + if (const TreeEntry *VTE = getTreeEntry(V)) { + Instruction &EntryUserInst = getLastInstructionInBundle(VTE); + if (&EntryUserInst == &UserInst || !CheckOrdering(&EntryUserInst)) + continue; + VToTEs.insert(VTE); + } + if (VToTEs.empty()) continue; - VToTEs.insert(VTE); + if (UsedTEs.empty()) { + // The first iteration, just insert the list of nodes to vector. + UsedTEs.push_back(VToTEs); + UsedValuesEntry.try_emplace(V, 0); + } else { + // Need to check if there are any previously used tree nodes which use + // V. If there are no such nodes, consider that we have another one + // input vector. + SmallPtrSet SavedVToTEs(VToTEs); + unsigned Idx = 0; + for (SmallPtrSet &Set : UsedTEs) { + // Do we have a non-empty intersection of previously listed tree + // entries and tree entries using current V? + set_intersect(VToTEs, Set); + if (!VToTEs.empty()) { + // Yes, write the new subset and continue analysis for the next + // scalar. + Set.swap(VToTEs); + break; + } + VToTEs = SavedVToTEs; + ++Idx; + } + // No non-empty intersection found - need to add a second set of + // possible source vectors. + if (Idx == UsedTEs.size()) { + // If the number of input vectors is greater than 2 - not a + // permutation, fallback to the regular gather. + // TODO: support multiple reshuffled nodes. + if (UsedTEs.size() == 2) + continue; + UsedTEs.push_back(SavedVToTEs); + Idx = UsedTEs.size() - 1; + } + UsedValuesEntry.try_emplace(V, Idx); + } } - if (VToTEs.empty()) - continue; + if (UsedTEs.empty()) { - // The first iteration, just insert the list of nodes to vector. - UsedTEs.push_back(VToTEs); - UsedValuesEntry.try_emplace(V, 0); + Res.push_back(std::nullopt); + Entries.emplace_back(); + continue; + } + + unsigned VF = 0; + if (UsedTEs.size() == 1) { + // Keep the order to avoid non-determinism. + SmallVector FirstEntries(UsedTEs.front().begin(), + UsedTEs.front().end()); + sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { + return TE1->Idx < TE2->Idx; + }); + // Try to find the perfect match in another gather node at first. + auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { + return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); + }); + if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, *It); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + // No perfect match, just shuffle, so choose the first tree node from the + // tree. + Entries.emplace_back(1, FirstEntries.front()); } else { - // Need to check if there are any previously used tree nodes which use V. - // If there are no such nodes, consider that we have another one input - // vector. - SmallPtrSet SavedVToTEs(VToTEs); - unsigned Idx = 0; - for (SmallPtrSet &Set : UsedTEs) { - // Do we have a non-empty intersection of previously listed tree entries - // and tree entries using current V? - set_intersect(VToTEs, Set); - if (!VToTEs.empty()) { - // Yes, write the new subset and continue analysis for the next - // scalar. - Set.swap(VToTEs); + // Try to find nodes with the same vector factor. + assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); + // Keep the order of tree nodes to avoid non-determinism. + DenseMap VFToTE; + for (const TreeEntry *TE : UsedTEs.front()) { + unsigned VF = TE->getVectorFactor(); + auto It = VFToTE.find(VF); + if (It != VFToTE.end()) { + if (It->second->Idx > TE->Idx) + It->getSecond() = TE; + continue; + } + VFToTE.try_emplace(VF, TE); + } + // Same, keep the order to avoid non-determinism. + SmallVector SecondEntries(UsedTEs.back().begin(), + UsedTEs.back().end()); + sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { + return TE1->Idx < TE2->Idx; + }); + for (const TreeEntry *TE : SecondEntries) { + auto It = VFToTE.find(TE->getVectorFactor()); + if (It != VFToTE.end()) { + VF = It->first; + Entries.emplace_back( + std::initializer_list({It->second, TE})); break; } - VToTEs = SavedVToTEs; - ++Idx; } - // No non-empty intersection found - need to add a second set of possible - // source vectors. - if (Idx == UsedTEs.size()) { - // If the number of input vectors is greater than 2 - not a permutation, - // fallback to the regular gather. - // TODO: support multiple reshuffled nodes. - if (UsedTEs.size() == 2) + // No 2 source vectors with the same vector factor - just choose 2 with + // max index. + if (Entries.size() == Part) { + Entries.emplace_back(std::initializer_list( + {*std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(), + [](const TreeEntry *TE1, const TreeEntry *TE2) { + return TE1->Idx < TE2->Idx; + }), + SecondEntries.front()})); + VF = std::max(Entries.back().front()->getVectorFactor(), + Entries.back().back()->getVectorFactor()); + } + } + + bool IsSplatOrUndefs = isSplat(SubVL) || all_of(SubVL, UndefValue::classof); + // Checks if the 2 PHIs are compatible in terms of high possibility to be + // vectorized. + auto AreCompatiblePHIs = [&](Value *V, Value *V1) { + auto *PHI = cast(V); + auto *PHI1 = cast(V1); + // Check that all incoming values are compatible/from same parent (if they + // are instructions). + // The incoming values are compatible if they all are constants, or + // instruction with the same/alternate opcodes from the same basic block. + for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { + Value *In = PHI->getIncomingValue(I); + Value *In1 = PHI1->getIncomingValue(I); + if (isConstant(In) && isConstant(In1)) continue; - UsedTEs.push_back(SavedVToTEs); - Idx = UsedTEs.size() - 1; + if (!getSameOpcode({In, In1}, *TLI).getOpcode()) + return false; + if (cast(In)->getParent() != + cast(In1)->getParent()) + return false; } - UsedValuesEntry.try_emplace(V, Idx); - } - } - - if (UsedTEs.empty()) - return std::nullopt; - - unsigned VF = 0; - if (UsedTEs.size() == 1) { - // Keep the order to avoid non-determinism. - SmallVector FirstEntries(UsedTEs.front().begin(), - UsedTEs.front().end()); - sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { - return TE1->Idx < TE2->Idx; - }); - // Try to find the perfect match in another gather node at first. - auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) { - return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars); - }); - if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { - Entries.push_back(*It); - std::iota(Mask.begin(), Mask.end(), 0); - // Clear undef scalars. - for (int I = 0, Sz = VL.size(); I < Sz; ++I) - if (isa(VL[I])) - Mask[I] = PoisonMaskElem; - return TargetTransformInfo::SK_PermuteSingleSrc; - } - // No perfect match, just shuffle, so choose the first tree node from the - // tree. - Entries.push_back(FirstEntries.front()); - } else { - // Try to find nodes with the same vector factor. - assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); - // Keep the order of tree nodes to avoid non-determinism. - DenseMap VFToTE; - for (const TreeEntry *TE : UsedTEs.front()) { - unsigned VF = TE->getVectorFactor(); - auto It = VFToTE.find(VF); - if (It != VFToTE.end()) { - if (It->second->Idx > TE->Idx) - It->getSecond() = TE; + return true; + }; + // Check if the value can be ignored during analysis for shuffled gathers. + // We suppose it is better to ignore instruction, which do not form splats, + // are not vectorized/not extractelements (these instructions will be + // handled by extractelements processing) or may form vector node in future. + auto MightBeIgnored = [=](Value *V) { + auto *I = dyn_cast(V); + SmallVector IgnoredVals; + if (UserIgnoreList) + IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); + return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && + !isVectorLikeInstWithConstOps(I) && + !areAllUsersVectorized(I, IgnoredVals) && isSimple(I); + }; + // Check that the neighbor instruction may form a full vector node with the + // current instruction V. It is possible, if they have same/alternate opcode + // and same parent basic block. + auto NeighborMightBeIgnored = [&](Value *V, int Idx) { + Value *V1 = SubVL[Idx]; + bool UsedInSameVTE = false; + auto It = UsedValuesEntry.find(V1); + if (It != UsedValuesEntry.end()) + UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; + return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && + getSameOpcode({V, V1}, *TLI).getOpcode() && + cast(V)->getParent() == + cast(V1)->getParent() && + (!isa(V1) || AreCompatiblePHIs(V, V1)); + }; + // Build a shuffle mask for better cost estimation and vector emission. + SmallBitVector UsedIdxs(Entries.back().size()); + SmallVector> EntryLanes; + for (unsigned I = 0; I < SliceSize; ++I) { + Value *V = SubVL[I]; + auto It = UsedValuesEntry.find(V); + if (It == UsedValuesEntry.end()) continue; - } - VFToTE.try_emplace(VF, TE); + // Do not try to shuffle scalars, if they are constants, or instructions + // that can be vectorized as a result of the following vector build + // vectorization. + if (isConstant(V) || + (MightBeIgnored(V) && + ((I > 0 && NeighborMightBeIgnored(V, I - 1)) || + (I != SliceSize - 1 && NeighborMightBeIgnored(V, I + 1))))) + continue; + unsigned Idx = It->second; + EntryLanes.emplace_back(Idx, I); + UsedIdxs.set(Idx); + } + // Iterate through all shuffled scalars and select entries, which can be + // used for final shuffle. + SmallVector TempEntries; + for (unsigned I = 0, Sz = Entries.back().size(); I < Sz; ++I) { + if (!UsedIdxs.test(I)) + continue; + // Fix the entry number for the given scalar. If it is the first entry, + // set Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). + // These indices are used when calculating final shuffle mask as the + // vector offset. + for (std::pair &Pair : EntryLanes) + if (Pair.first == I) + Pair.first = TempEntries.size(); + TempEntries.push_back(Entries.back()[I]); + } + Entries.back().swap(TempEntries); + if (EntryLanes.size() == Entries.back().size() && + !SubVL.equals( + ArrayRef(TE->Scalars) + .slice(Part * SliceSize, + std::min(SliceSize, TE->Scalars.size())))) { + // We may have here 1 or 2 entries only. If the number of scalars is equal + // to the number of entries, no need to do the analysis, it is not very + // profitable. Since SubVL is not the same as TE->Scalars, it means we + // already have some shuffles before. Cut off not profitable case. + Entries.back().clear(); + Res.push_back(std::nullopt); + continue; } - // Same, keep the order to avoid non-determinism. - SmallVector SecondEntries(UsedTEs.back().begin(), - UsedTEs.back().end()); - sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) { - return TE1->Idx < TE2->Idx; - }); - for (const TreeEntry *TE : SecondEntries) { - auto It = VFToTE.find(TE->getVectorFactor()); - if (It != VFToTE.end()) { - VF = It->first; - Entries.push_back(It->second); - Entries.push_back(TE); - break; + // Build the final mask, check for the identity shuffle, if possible. + bool IsIdentity = Entries.back().size() == 1; + // Pair.first is the offset to the vector, while Pair.second is the index of + // scalar in the list. + for (const std::pair &Pair : EntryLanes) { + unsigned Idx = Part * SliceSize + Pair.second; + Mask[Idx] = + Pair.first * VF + + Entries.back()[Pair.first]->findLaneForValue(SubVL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; + } + switch (Entries.back().size()) { + case 1: + if (IsIdentity || EntryLanes.size() > 1 || SubVL.size() <= 2) { + Res.push_back(TTI::SK_PermuteSingleSrc); + continue; } - } - // No 2 source vectors with the same vector factor - just choose 2 with max - // index. - if (Entries.empty()) { - Entries.push_back( - *std::max_element(UsedTEs.front().begin(), UsedTEs.front().end(), - [](const TreeEntry *TE1, const TreeEntry *TE2) { - return TE1->Idx < TE2->Idx; - })); - Entries.push_back(SecondEntries.front()); - VF = std::max(Entries.front()->getVectorFactor(), - Entries.back()->getVectorFactor()); - } - } - - bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, UndefValue::classof); - // Checks if the 2 PHIs are compatible in terms of high possibility to be - // vectorized. - auto AreCompatiblePHIs = [&](Value *V, Value *V1) { - auto *PHI = cast(V); - auto *PHI1 = cast(V1); - // Check that all incoming values are compatible/from same parent (if they - // are instructions). - // The incoming values are compatible if they all are constants, or - // instruction with the same/alternate opcodes from the same basic block. - for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { - Value *In = PHI->getIncomingValue(I); - Value *In1 = PHI1->getIncomingValue(I); - if (isConstant(In) && isConstant(In1)) + break; + case 2: + if (EntryLanes.size() > 2 || SubVL.size() <= 2) { + Res.push_back(TTI::SK_PermuteTwoSrc); continue; - if (!getSameOpcode({In, In1}, *TLI).getOpcode()) - return false; - if (cast(In)->getParent() != - cast(In1)->getParent()) - return false; + } + break; + default: + break; } - return true; - }; - // Check if the value can be ignored during analysis for shuffled gathers. - // We suppose it is better to ignore instruction, which do not form splats, - // are not vectorized/not extractelements (these instructions will be handled - // by extractelements processing) or may form vector node in future. - auto MightBeIgnored = [=](Value *V) { - auto *I = dyn_cast(V); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) && - !isVectorLikeInstWithConstOps(I) && - !areAllUsersVectorized(I, IgnoredVals) && isSimple(I); - }; - // Check that the neighbor instruction may form a full vector node with the - // current instruction V. It is possible, if they have same/alternate opcode - // and same parent basic block. - auto NeighborMightBeIgnored = [&](Value *V, int Idx) { - Value *V1 = VL[Idx]; - bool UsedInSameVTE = false; - auto It = UsedValuesEntry.find(V1); - if (It != UsedValuesEntry.end()) - UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second; - return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && - getSameOpcode({V, V1}, *TLI).getOpcode() && - cast(V)->getParent() == - cast(V1)->getParent() && - (!isa(V1) || AreCompatiblePHIs(V, V1)); - }; - // Build a shuffle mask for better cost estimation and vector emission. - SmallBitVector UsedIdxs(Entries.size()); - SmallVector> EntryLanes; - for (int I = 0, E = VL.size(); I < E; ++I) { - Value *V = VL[I]; - auto It = UsedValuesEntry.find(V); - if (It == UsedValuesEntry.end()) - continue; - // Do not try to shuffle scalars, if they are constants, or instructions - // that can be vectorized as a result of the following vector build - // vectorization. - if (isConstant(V) || (MightBeIgnored(V) && - ((I > 0 && NeighborMightBeIgnored(V, I - 1)) || - (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))) - continue; - unsigned Idx = It->second; - EntryLanes.emplace_back(Idx, I); - UsedIdxs.set(Idx); - } - // Iterate through all shuffled scalars and select entries, which can be used - // for final shuffle. - SmallVector TempEntries; - for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) { - if (!UsedIdxs.test(I)) - continue; - // Fix the entry number for the given scalar. If it is the first entry, set - // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes). - // These indices are used when calculating final shuffle mask as the vector - // offset. - for (std::pair &Pair : EntryLanes) - if (Pair.first == I) - Pair.first = TempEntries.size(); - TempEntries.push_back(Entries[I]); - } - Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { - // We may have here 1 or 2 entries only. If the number of scalars is equal - // to the number of entries, no need to do the analysis, it is not very - // profitable. Since VL is not the same as TE->Scalars, it means we already - // have some shuffles before. Cut off not profitable case. - Entries.clear(); - return std::nullopt; + Entries.back().clear(); + Res.push_back(std::nullopt); } - // Build the final mask, check for the identity shuffle, if possible. - bool IsIdentity = Entries.size() == 1; - // Pair.first is the offset to the vector, while Pair.second is the index of - // scalar in the list. - for (const std::pair &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; - } - switch (Entries.size()) { - case 1: - if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteSingleSrc; - break; - case 2: - if (EntryLanes.size() > 2 || VL.size() <= 2) - return TargetTransformInfo::SK_PermuteTwoSrc; - break; - default: - break; + if (all_of(Res, + [](const std::optional &SK) { return !SK; })) { + Entries.clear(); + return {}; } - Entries.clear(); - return std::nullopt; + return Res; } InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, @@ -9386,9 +9527,13 @@ } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { + Value *needToDelay(const TreeEntry *E, + ArrayRef> Deps) { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) + if (all_of(Deps, [](ArrayRef TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) return nullptr; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. @@ -9676,7 +9821,7 @@ inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - auto FindReusedSplat = [&](SmallVectorImpl &Mask) { + auto FindReusedSplat = [&](MutableArrayRef Mask) { if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { return isa(V) && !isa(V); })) @@ -9708,9 +9853,13 @@ SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffle; + SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(ScalarTy, GatheredScalars.size())); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); @@ -9731,9 +9880,10 @@ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffle = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { + if (!GatherShuffle.empty()) { if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); @@ -9741,10 +9891,9 @@ // process to keep correct order. return Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffle.size() == 1 && + *GatherShuffle.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -9758,16 +9907,19 @@ continue; } if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = Entries.front().front()->findLaneForValue(V); } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(Entries.front().front()->VectorizedValue, Mask); Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -9867,9 +10019,9 @@ } } }; - if (ExtractShuffle || GatherShuffle) { + if (ExtractShuffle || !GatherShuffle.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of @@ -9894,31 +10046,48 @@ } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat(ExtractMask); + IsUsedInExpr &= FindReusedSplat(ExtractMask); ShuffleBuilder.add(Vec1, ExtractMask); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), ExtractMask); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); - } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + if (!GatherShuffle.empty()) { + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffle[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) { + IsUsedInExpr &= FindReusedSplat(VecMask); + ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + } else { + IsUsedInExpr = false; + ShuffleBuilder.add(TEs.front()->VectorizedValue, + TEs.back()->VectorizedValue, VecMask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + } } } // Try to figure out best way to combine values: build a shuffle and insert @@ -9929,14 +10098,18 @@ int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = !ExtractShuffle || GatherShuffle.empty(); bool IsIdentityShuffle = (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask)) || - (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + (!GatherShuffle.empty() && + all_of(GatherShuffle, + [](const std::optional &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + }) && none_of(Mask, [&](int I) { return I >= MSz; }) && ShuffleVectorInst::isIdentityMask(Mask)); bool EnoughConstsForShuffle = diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -4,49 +4,27 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[P0:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[A2:%.*]] = add i64 [[P2:%.*]], [[P2]] -; CHECK-NEXT: [[A3:%.*]] = add i64 [[P3:%.*]], [[P3]] -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[M2:%.*]] = mul i64 [[P2]], [[P2]] -; CHECK-NEXT: [[M3:%.*]] = mul i64 [[P3]], [[P3]] -; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]] -; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[S0:%.*]] = sub i64 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sub i64 [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[S2:%.*]] = sub i64 [[M2]], [[D2]] -; CHECK-NEXT: [[S3:%.*]] = sub i64 [[M3]], [[D3]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[SHL1:%.*]] = shl i64 [[TMP9]], [[S0]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[TMP10]], [[S1]] -; CHECK-NEXT: [[SHL3:%.*]] = shl i64 [[A2]], [[S2]] -; CHECK-NEXT: [[SHL4:%.*]] = shl i64 [[A3]], [[S3]] -; CHECK-NEXT: [[O0:%.*]] = or i64 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TT0:%.*]] = trunc i64 [[O0]] to i32 -; CHECK-NEXT: [[O1:%.*]] = or i64 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TT1:%.*]] = trunc i64 [[O1]] to i32 -; CHECK-NEXT: [[O2:%.*]] = or i64 [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TT2:%.*]] = trunc i64 [[O2]] to i32 -; CHECK-NEXT: [[O3:%.*]] = or i64 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TT3:%.*]] = trunc i64 [[O3]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[PHI0:%.*]] = phi i32 [ [[T1:%.*]], [[BB]] ], [ [[TT0]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[T2:%.*]], [[BB]] ], [ [[TT1]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[T3:%.*]], [[BB]] ], [ [[TT2]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ [[T4:%.*]], [[BB]] ], [ [[TT3]], [[ENTRY]] ] -; CHECK-NEXT: [[T1]] = trunc i64 [[SHL1]] to i32 -; CHECK-NEXT: [[T2]] = trunc i64 [[SHL2]] to i32 -; CHECK-NEXT: [[T3]] = trunc i64 [[SHL3]] to i32 -; CHECK-NEXT: [[T4]] = trunc i64 [[SHL4]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; entry: