diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -533,6 +533,112 @@
   return *EI->idx_begin();
 }
 
+/// Tries to find extractelement instructions with constant indices from a
+/// fixed vector type and gathers such instructions into a group, which is
+/// highly likely to be detected as a shuffle of one or two input vectors. If
+/// this attempt was successful, the matched scalars are replaced by poison
+/// values in \p VL for future analysis.
+static std::optional<TargetTransformInfo::ShuffleKind>
+tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
+                           SmallVectorImpl<int> &Mask) {
+  // Scan the list of gathered scalars for extractelements that can be
+  // represented as shuffles.
+  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+  SmallVector<int> UndefVectorExtracts;
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+    if (!EI)
+      continue;
+    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
+      continue;
+    std::optional<unsigned> Idx = getExtractIndex(EI);
+    // Undefined index.
+    if (!Idx) {
+      UndefVectorExtracts.push_back(I);
+      continue;
+    }
+    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+    ExtractMask.reset(*Idx);
+    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+      UndefVectorExtracts.push_back(I);
+      continue;
+    }
+    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+  }
+  // Sort the vector operands by the maximum number of uses in extractelements.
+  MapVector<unsigned, SmallVector<Value *>> VFToVector;
+  for (const auto &Data : VectorOpToIdx)
+    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+        .push_back(Data.first);
+  for (auto &Data : VFToVector) {
+    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+      return VectorOpToIdx.find(V1)->second.size() >
+             VectorOpToIdx.find(V2)->second.size();
+    });
+  }
+  // Find the best pair of vectors with the same number of elements, or a
+  // single vector.
+  const int UndefSz = UndefVectorExtracts.size();
+  unsigned SingleMax = 0;
+  Value *SingleVec = nullptr;
+  unsigned PairMax = 0;
+  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+  for (auto &Data : VFToVector) {
+    Value *V1 = Data.second.front();
+    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+      SingleVec = V1;
+    }
+    Value *V2 = nullptr;
+    if (Data.second.size() > 1)
+      V2 = *std::next(Data.second.begin());
+    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+                            UndefSz) {
+      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+      PairVec = std::make_pair(V1, V2);
+    }
+  }
+  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+    return std::nullopt;
+  // Check if it is better to perform a shuffle of two vectors or just of a
+  // single vector.
+  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
+  SmallVector<Value *> GatheredExtracts(
+      VL.size(), PoisonValue::get(VL.front()->getType()));
+  if (SingleMax >= PairMax && SingleMax) {
+    for (int Idx : VectorOpToIdx[SingleVec])
+      std::swap(GatheredExtracts[Idx], VL[Idx]);
+  } else {
+    for (Value *V : {PairVec.first, PairVec.second})
+      for (int Idx : VectorOpToIdx[V])
+        std::swap(GatheredExtracts[Idx], VL[Idx]);
+  }
+  // Add extracts from undefs too.
+  for (int Idx : UndefVectorExtracts)
+    std::swap(GatheredExtracts[Idx], VL[Idx]);
+  // Check that the gather of extractelements can be represented as just a
+  // shuffle of the single/two vectors the scalars are extracted from.
+  std::optional<TargetTransformInfo::ShuffleKind> Res =
+      isFixedVectorShuffle(GatheredExtracts, Mask);
+  if (!Res) {
+    // Restore the original VL if the attempt was not successful.
+    VL.swap(SavedVL);
+    return std::nullopt;
+  }
+  // Restore unused scalars from the mask.
+  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
+    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
+        is_contained(UndefVectorExtracts, I))
+      continue;
+    if (Mask[I] == UndefMaskElem)
+      std::swap(VL[I], GatheredExtracts[I]);
+  }
+  return Res;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -2341,10 +2447,14 @@
 
   /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
   /// tree entries.
+  /// \param TE Tree entry checked for permutation.
+  /// \param VL List of scalars (a subset of the TE scalars), checked for
+  /// permutations.
   /// \returns ShuffleKind, if gathered values can be represented as shuffles of
   /// previous tree entries. \p Mask is filled with the shuffle mask.
   std::optional<TargetTransformInfo::ShuffleKind>
-  isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+  isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+                        SmallVectorImpl<int> &Mask,
                         SmallVectorImpl<const TreeEntry *> &Entries);
 
   /// \returns the scalarization cost for this list of values. Assuming that
@@ -6611,15 +6721,19 @@
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   // FIXME: it tries to fix a problem with MSVC buildbots.
   TargetTransformInfo *TTI = this->TTI;
-  auto AdjustExtractsCost = [=](InstructionCost &Cost) {
+  auto AdjustExtractsCost = [=](InstructionCost &Cost,
+                                ArrayRef<int> Mask) -> Value * {
+    if (Mask.empty())
+      return nullptr;
+    Value *VecBase = nullptr;
     // If the resulting type is scalarized, do not adjust the cost.
     unsigned VecNumParts = TTI->getNumberOfParts(VecTy);
     if (VecNumParts == VecTy->getNumElements())
-      return;
+      return nullptr;
     DenseMap<Value *, int> ExtractVectorsTys;
     SmallPtrSet<Value *, 4> CheckedExtracts;
-    for (auto *V : VL) {
-      if (isa<UndefValue>(V))
+    for (auto [I, V] : enumerate(VL)) {
+      if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == UndefMaskElem))
         continue;
       // If all users of instruction are going to be vectorized and this
       // instruction itself is not going to be vectorized, consider this
@@ -6633,6 +6747,7 @@
           (VE && VE != E))
         continue;
       auto *EE = cast<ExtractElementInst>(V);
+      VecBase = EE->getVectorOperand();
       std::optional<unsigned> EEIdx = getExtractIndex(EE);
       if (!EEIdx)
         continue;
@@ -6671,6 +6786,8 @@
       if (TTI->getNumberOfParts(EEVTy) > VecNumParts) {
         unsigned Idx = (Data.second / NumElts) * NumElts;
         unsigned EENumElts = EEVTy->getNumElements();
+        if (Idx % NumElts == 0)
+          continue;
         if (Idx + NumElts <= EENumElts) {
           Cost += TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
@@ -6690,17 +6807,68 @@
                                     VecTy, std::nullopt, CostKind, 0, EEVTy);
       }
     }
+    return VecBase;
   };
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
       return 0;
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
+    unsigned VF = E->getVectorFactor();
+    SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                          E->ReuseShuffleIndices.end());
+    SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+    // Build a mask out of the reorder indices and reorder scalars per this
+    // mask.
+    SmallVector<int> ReorderMask;
+    inversePermutation(E->ReorderIndices, ReorderMask);
+    if (!ReorderMask.empty())
+      reorderScalars(GatheredScalars, ReorderMask);
     SmallVector<int> Mask;
+    SmallVector<int> ExtractMask;
+    std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
     SmallVector<const TreeEntry *> Entries;
-    std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
-        isGatherShuffledEntry(E, Mask, Entries);
-    if (Shuffle) {
+    Type *ScalarTy = GatheredScalars.front()->getType();
+    // Check for gathered extracts.
+    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+
+    InstructionCost Cost = 0;
+    Value *VecBase = AdjustExtractsCost(Cost, ExtractMask);
+    bool Resized = false;
+    if (VecBase)
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+        }
+
+    // Gather extracts after we check for full matched gathers only.
+    if (!(E->getOpcode() == Instruction::Load && !E->isAltShuffle() &&
+          !all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) &&
+          !isSplat(E->Scalars) && !ExtractShuffle &&
+          (E->Scalars == GatheredScalars || GatheredScalars.size() > 2)))
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+    if (GatherShuffle) {
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      if (!Resized) {
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 && GatheredScalars.size() != VF1) ||
+            (VF == VF2 && GatheredScalars.size() != VF2))
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+      }
+      // Remove shuffled elements from list of gathers.
+      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+        if (Mask[I] != UndefMaskElem)
+          GatheredScalars[I] = PoisonValue::get(ScalarTy);
+      }
       InstructionCost GatherCost = 0;
       if (ShuffleVectorInst::isIdentityMask(Mask)) {
         // Perfect match in the graph, will reuse the previously vectorized
@@ -6721,33 +6889,23 @@
         // previously vectorized nodes. Add the cost of the permutation rather
         // than gather.
         ::addMask(Mask, E->ReuseShuffleIndices);
-        GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
+        GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
       }
       return GatherCost;
     }
-    if ((E->getOpcode() == Instruction::ExtractElement ||
-         all_of(E->Scalars,
-                [](Value *V) {
-                  return isa<ExtractElementInst, UndefValue>(V);
-                })) &&
-        allSameType(VL)) {
+    if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
       // Check that gather of extractelements can be represented as just a
       // shuffle of a single/two vectors the scalars are extracted from.
-      SmallVector<int> Mask;
-      std::optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
-          isFixedVectorShuffle(VL, Mask);
-      if (ShuffleKind) {
-        // Found the bunch of extractelement instructions that must be gathered
-        // into a vector and can be represented as a permutation elements in a
-        // single input vector or of 2 input vectors.
-        InstructionCost Cost =
-            computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
-        AdjustExtractsCost(Cost);
-        if (NeedToShuffleReuses)
-          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                      FinalVecTy, E->ReuseShuffleIndices);
-        return Cost;
-      }
+      // Found the bunch of extractelement instructions that must be gathered
+      // into a vector and can be represented as a permutation of elements in
+      // a single input vector or of 2 input vectors.
+      InstructionCost Cost =
+          computeExtractCost(VL, VecTy, *ExtractShuffle, ExtractMask, *TTI);
+      AdjustExtractsCost(Cost, ExtractMask);
+      if (NeedToShuffleReuses)
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    FinalVecTy, E->ReuseShuffleIndices);
+      return Cost;
     }
     if (isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as the
@@ -8071,21 +8229,88 @@
 }
 
 std::optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+                               SmallVectorImpl<int> &Mask,
                                SmallVectorImpl<const TreeEntry *> &Entries) {
+  Entries.clear();
+  // No need to check for the topmost gather node.
+  if (TE == VectorizableTree.front().get())
+    return std::nullopt;
+  Mask.assign(VL.size(), UndefMaskElem);
+  assert(TE->UserTreeIndices.size() == 1 &&
+         "Expected only single user of the gather node.");
   // TODO: currently checking only for Scalars in the tree entry, need to count
   // reused elements too for better cost estimation.
-  Mask.assign(TE->Scalars.size(), UndefMaskElem);
-  Entries.clear();
+  Instruction &UserInst =
+      getLastInstructionInBundle(TE->UserTreeIndices.front().UserTE);
+  auto *PHI = dyn_cast<PHINode>(&UserInst);
+  auto *NodeUI = DT->getNode(
+      PHI ? PHI->getIncomingBlock(TE->UserTreeIndices.front().EdgeIdx)
          : UserInst.getParent());
+  assert(NodeUI && "Should only process reachable instructions");
+  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
+  auto CheckOrdering = [&](Instruction *LastEI) {
+    // Check if the user node of the TE comes after the user node of EntryPtr;
+    // otherwise EntryPtr depends on TE.
+    // Gather nodes usually are not scheduled and are inserted before their
+    // first user node. So, instead of checking the dependency between the
+    // gather nodes themselves, we check the dependency between their user
+    // nodes. If one user node comes before the second one, we cannot use the
+    // second gather node as the source vector for the first gather node,
+    // because in the list of instructions it will be emitted later.
+    auto *EntryParent = LastEI->getParent();
+    auto *NodeEUI = DT->getNode(EntryParent);
+    if (!NodeEUI)
+      return false;
+    assert((NodeUI == NodeEUI) ==
+               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
+           "Different nodes should have different DFS numbers");
+    // Check the order of the gather nodes' users.
+    if (UserInst.getParent() != EntryParent &&
+        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
+      return false;
+    if (UserInst.getParent() == EntryParent && UserInst.comesBefore(LastEI))
+      return false;
+    return true;
+  };
   // Build a list of values to tree entries.
   DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
   for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
     if (EntryPtr.get() == TE)
-      break;
+      continue;
     if (EntryPtr->State != TreeEntry::NeedToGather)
       continue;
+    if (!any_of(EntryPtr->Scalars, [&GatheredScalars](Value *V) {
+          return GatheredScalars.contains(V);
+        }))
+      continue;
+    assert(EntryPtr->UserTreeIndices.size() == 1 &&
+           "Expected only single user of the gather node.");
+    Instruction &EntryUserInst =
+        getLastInstructionInBundle(EntryPtr->UserTreeIndices.front().UserTE);
+    if (&UserInst == &EntryUserInst) {
+      // If 2 gathers are operands of the same entry, compare operand indices
+      // and use the earlier one as the base.
+      if (TE->UserTreeIndices.front().UserTE ==
+              EntryPtr->UserTreeIndices.front().UserTE &&
+          TE->UserTreeIndices.front().EdgeIdx <
+              EntryPtr->UserTreeIndices.front().EdgeIdx)
+        continue;
+    }
+    // Check if the user node of the TE comes after the user node of EntryPtr;
+    // otherwise EntryPtr depends on TE.
+    auto *EntryPHI = dyn_cast<PHINode>(&EntryUserInst);
+    auto *EntryI =
+        EntryPHI
+            ? EntryPHI
+                  ->getIncomingBlock(EntryPtr->UserTreeIndices.front().EdgeIdx)
+                  ->getTerminator()
+            : &EntryUserInst;
+    if (!CheckOrdering(EntryI))
+      continue;
     for (Value *V : EntryPtr->Scalars)
-      ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
+      if (!isConstant(V))
+        ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
   }
   // Find all tree entries used by the gathered values. If no common entries
   // found - not a shuffle.
@@ -8097,7 +8322,7 @@
   SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
   DenseMap<Value *, int> UsedValuesEntry;
   for (Value *V : TE->Scalars) {
-    if (isa<UndefValue>(V))
+    if (isConstant(V))
       continue;
     // Build a list of tree entries where V is used.
     SmallPtrSet<const TreeEntry *, 4> VToTEs;
@@ -8107,10 +8332,11 @@
     if (const TreeEntry *VTE = getTreeEntry(V))
       VToTEs.insert(VTE);
     if (VToTEs.empty())
-      return std::nullopt;
+      continue;
     if (UsedTEs.empty()) {
       // The first iteration, just insert the list of nodes to vector.
       UsedTEs.push_back(VToTEs);
+      UsedValuesEntry.try_emplace(V, 0);
     } else {
       // Need to check if there are any previously used tree nodes which use V.
       // If there are no such nodes, consider that we have another one input
@@ -8135,8 +8361,9 @@
     if (Idx == UsedTEs.size()) {
       // If the number of input vectors is greater than 2 - not a permutation,
       // fallback to the regular gather.
+      // TODO: support multiple reshuffled nodes.
       if (UsedTEs.size() == 2)
-        return std::nullopt;
+        continue;
       UsedTEs.push_back(SavedVToTEs);
       Idx = UsedTEs.size() - 1;
     }
@@ -8144,32 +8371,55 @@
     }
   }
 
-  if (UsedTEs.empty()) {
-    assert(all_of(TE->Scalars, UndefValue::classof) &&
-           "Expected vector of undefs only.");
+  if (UsedTEs.empty())
     return std::nullopt;
-  }
 
   unsigned VF = 0;
   if (UsedTEs.size() == 1) {
+    // Keep the order to avoid non-determinism.
+    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
+                                                UsedTEs.front().end());
+    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+      return TE1->Idx < TE2->Idx;
+    });
     // Try to find the perfect match in another gather node at first.
-    auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
-      return EntryPtr->isSame(TE->Scalars);
+    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
+      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
     });
-    if (It != UsedTEs.front().end()) {
+    if (It != FirstEntries.end()) {
       Entries.push_back(*It);
       std::iota(Mask.begin(), Mask.end(), 0);
+      // Clear undef scalars.
+      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
+        if (isa<UndefValue>(TE->Scalars[I]))
+          Mask[I] = UndefMaskElem;
       return TargetTransformInfo::SK_PermuteSingleSrc;
     }
-    // No perfect match, just shuffle, so choose the first tree node.
-    Entries.push_back(*UsedTEs.front().begin());
+    // No perfect match, just shuffle, so choose the first tree node from the
+    // tree.
+    Entries.push_back(FirstEntries.front());
   } else {
     // Try to find nodes with the same vector factor.
     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
+    // Keep the order of tree nodes to avoid non-determinism.
     DenseMap<unsigned, const TreeEntry *> VFToTE;
-    for (const TreeEntry *TE : UsedTEs.front())
-      VFToTE.try_emplace(TE->getVectorFactor(), TE);
-    for (const TreeEntry *TE : UsedTEs.back()) {
+    for (const TreeEntry *TE : UsedTEs.front()) {
+      unsigned VF = TE->getVectorFactor();
+      auto It = VFToTE.find(VF);
+      if (It != VFToTE.end()) {
+        if (It->second->Idx > TE->Idx)
+          It->getSecond() = TE;
+        continue;
+      }
+      VFToTE.try_emplace(VF, TE);
+    }
+    // Same, keep the order to avoid non-determinism.
+    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
+                                                 UsedTEs.back().end());
+    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
+      return TE1->Idx < TE2->Idx;
+    });
+    for (const TreeEntry *TE : SecondEntries) {
       auto It = VFToTE.find(TE->getVectorFactor());
       if (It != VFToTE.end()) {
        VF = It->first;
@@ -8184,28 +8434,131 @@
     return std::nullopt;
   }
 
+  Value *SingleV = nullptr;
+  bool IsSplat = all_of(VL, [&SingleV](Value *V) {
+    if (!isa<UndefValue>(V)) {
+      if (!SingleV)
+        SingleV = V;
+      return SingleV == V;
+    }
+    return true;
+  });
+  // Checks if the 2 PHIs are compatible in terms of their likelihood of being
+  // vectorized together.
+  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
+    auto *PHI = cast<PHINode>(V);
+    auto *PHI1 = cast<PHINode>(V1);
+    // Check that all incoming values are compatible/from the same parent (if
+    // they are instructions).
+    // The incoming values are compatible if they all are constants, or
+    // instructions with the same/alternate opcodes from the same basic block.
+    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+      Value *In = PHI->getIncomingValue(I);
+      Value *In1 = PHI1->getIncomingValue(I);
+      if (isConstant(In) && isConstant(In1))
+        continue;
+      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
+        return false;
+      if (cast<Instruction>(In)->getParent() !=
+          cast<Instruction>(In1)->getParent())
+        return false;
+    }
+    return true;
+  };
+  // Check if the value can be ignored during analysis for shuffled gathers.
+  // We suppose it is better to ignore instructions that do not form splats,
+  // are not vectorized/not extractelements (these instructions will be handled
+  // by the extractelements processing) or may form a vector node in the
+  // future.
+  auto MightBeIgnored = [=](Value *V) {
+    auto *I = dyn_cast<Instruction>(V);
+    SmallVector<Value *> IgnoredVals;
+    if (UserIgnoreList)
+      IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+    return I && !IsSplat && !ScalarToTreeEntry.count(I) &&
+           !isVectorLikeInstWithConstOps(I) &&
+           !areAllUsersVectorized(I, IgnoredVals) && isSimple(I);
+  };
+  // Check that the neighbor instruction may form a full vector node with the
+  // current instruction V. It is possible, if they have the same/alternate
+  // opcode and the same parent basic block.
+  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
+    Value *V1 = VL[Idx];
+    // Check if 2 values won't be vectorized in the same vector node.
+    bool UsedInSameVTE = false;
+    auto It = UsedValuesEntry.find(V1);
+    if (It != UsedValuesEntry.end())
+      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
+    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
+           getSameOpcode({V, V1}, *TLI).getOpcode() &&
+           cast<Instruction>(V)->getParent() ==
+               cast<Instruction>(V1)->getParent() &&
+           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
+  };
   // Build a shuffle mask for better cost estimation and vector emission.
-  for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
-    Value *V = TE->Scalars[I];
-    if (isa<UndefValue>(V))
+  SmallBitVector UsedIdxs(Entries.size());
+  SmallVector<std::pair<unsigned, int>> EntryLanes;
+  for (int I = 0, E = VL.size(); I < E; ++I) {
+    Value *V = VL[I];
+    auto It = UsedValuesEntry.find(V);
+    if (It == UsedValuesEntry.end())
       continue;
-    unsigned Idx = UsedValuesEntry.lookup(V);
-    const TreeEntry *VTE = Entries[Idx];
-    int FoundLane = VTE->findLaneForValue(V);
-    Mask[I] = Idx * VF + FoundLane;
-    // Extra check required by isSingleSourceMaskImpl function (called by
-    // ShuffleVectorInst::isSingleSourceMask).
-    if (Mask[I] >= 2 * E)
-      return std::nullopt;
+    // Do not try to shuffle scalars if they are constants, or instructions
+    // that can be vectorized as a result of the following vector build
+    // vectorization.
+    if (isConstant(V) || (MightBeIgnored(V) &&
+                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
+                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
+      continue;
+    unsigned Idx = It->second;
+    EntryLanes.emplace_back(Idx, I);
+    UsedIdxs.set(Idx);
+  }
+  // Iterate through all shuffled scalars and select entries that can be used
+  // for the final shuffle.
+  SmallVector<const TreeEntry *> TempEntries;
+  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
+    if (!UsedIdxs.test(I))
+      continue;
+    // Fix the entry number for the given scalar. If it is the first entry, set
+    // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
+    // These indices are used when calculating the final shuffle mask as the
+    // vector offset.
+    for (std::pair<unsigned, int> &Pair : EntryLanes)
+      if (Pair.first == I)
+        Pair.first = TempEntries.size();
+    TempEntries.push_back(Entries[I]);
+  }
+  Entries.swap(TempEntries);
+  if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
+    // We may have here 1 or 2 entries only. If the number of scalars is equal
+    // to the number of entries, no need to do the analysis, it is not very
+    // profitable. Since VL is not the same as TE->Scalars, it means we already
+    // have some shuffles before. Cut off the not-profitable case.
+    Entries.clear();
+    return std::nullopt;
+  }
+  // Build the final mask, check for the identity shuffle, if possible.
+  bool IsIdentity = Entries.size() == 1;
+  // Pair.first is the offset to the vector, while Pair.second is the index of
+  // the scalar in the list.
+  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
+    Mask[Pair.second] = Pair.first * VF +
+                        Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+    IsIdentity &= Mask[Pair.second] == Pair.second;
   }
   switch (Entries.size()) {
   case 1:
-    return TargetTransformInfo::SK_PermuteSingleSrc;
+    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+      return TargetTransformInfo::SK_PermuteSingleSrc;
+    break;
   case 2:
-    return TargetTransformInfo::SK_PermuteTwoSrc;
+    if (EntryLanes.size() > 2 || VL.size() <= 2)
+      return TargetTransformInfo::SK_PermuteTwoSrc;
+    break;
   default:
     break;
   }
+  Entries.clear();
   return std::nullopt;
 }
 
@@ -8966,19 +9319,152 @@
   }
   if (E->getMainOp())
     setInsertPointAfterBundle(E);
-  Value *Vec;
+  unsigned VF = E->getVectorFactor();
+  auto AdjustExtracts = [&](const TreeEntry *E, ArrayRef<int> Mask) {
+    Value *VecBase = nullptr;
+    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+      int Idx = Mask[I];
+      if (Idx == UndefMaskElem)
+        continue;
+      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+      VecBase = EI->getVectorOperand();
+      // If all users are vectorized - can delete the extractelement
+      // itself.
+      if (any_of(EI->users(),
+                 [&](User *U) { return !ScalarToTreeEntry.count(U); }))
+        continue;
+      eraseInstruction(EI);
+    }
+    return VecBase;
+  };
+  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
+    if (V1->getType() != V2->getType()) {
+      unsigned VF1 = cast<FixedVectorType>(V1->getType())->getNumElements();
+      unsigned VF2 = cast<FixedVectorType>(V2->getType())->getNumElements();
+      unsigned VF = std::max(VF1, VF2);
+      SmallVector<int> ExtMask(VF, UndefMaskElem);
+      std::iota(ExtMask.begin(),
+                std::next(ExtMask.begin(), std::min(VF1, VF2)), 0);
+      if (VF1 < VF2) {
+        V1 = Builder.CreateShuffleVector(V1, ExtMask);
+        if (auto *I = dyn_cast<Instruction>(V1)) {
+          GatherShuffleExtractSeq.insert(I);
+          CSEBlocks.insert(I->getParent());
+        }
+      } else {
+        V2 = Builder.CreateShuffleVector(V2, ExtMask);
+        if (auto *I = dyn_cast<Instruction>(V2)) {
+          GatherShuffleExtractSeq.insert(I);
+          CSEBlocks.insert(I->getParent());
+        }
+      }
+    }
+    Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
+    if (auto *I = dyn_cast<Instruction>(Vec)) {
+      GatherShuffleExtractSeq.insert(I);
+      CSEBlocks.insert(I->getParent());
+    }
+    return Vec;
+  };
+
+  SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
+                                        E->ReuseShuffleIndices.end());
+  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
+  // Build a mask out of the reorder indices and reorder scalars per this
+  // mask.
+  SmallVector<int> ReorderMask;
+  inversePermutation(E->ReorderIndices, ReorderMask);
+  if (!ReorderMask.empty())
+    reorderScalars(GatheredScalars, ReorderMask);
+  Value *Vec = nullptr;
   SmallVector<int> Mask;
+  SmallVector<int> ExtractMask;
+  std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
+  std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
   SmallVector<const TreeEntry *> Entries;
-  std::optional<TargetTransformInfo::ShuffleKind> Shuffle =
-      isGatherShuffledEntry(E, Mask, Entries);
-  if (Shuffle) {
+  Type *ScalarTy = GatheredScalars.front()->getType();
+  // Check for gathered extracts.
+  ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+  SmallVector<Value *> IgnoredVals;
+  if (UserIgnoreList)
+    IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+  Value *VecBase = AdjustExtracts(E, ExtractMask);
+  bool Resized = false;
+  if (VecBase)
+    if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+      if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
+        Resized = true;
+        GatheredScalars.append(VF - GatheredScalars.size(),
+                               PoisonValue::get(ScalarTy));
+      }
+  // Gather extracts after we check for full matched gathers only.
+  if (!(E->getOpcode() == Instruction::Load && !E->isAltShuffle() &&
+        !all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) &&
+        !isSplat(E->Scalars) &&
+        (E->Scalars == GatheredScalars || GatheredScalars.size() > 2)))
+    GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+  if (GatherShuffle) {
     assert((Entries.size() == 1 || Entries.size() == 2) &&
            "Expected shuffle of 1 or 2 entries.");
-    Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
-                                      Entries.back()->VectorizedValue, Mask);
-    if (auto *I = dyn_cast<Instruction>(Vec)) {
-      GatherShuffleExtractSeq.insert(I);
-      CSEBlocks.insert(I->getParent());
+    if (!Resized) {
+      unsigned VF1 = Entries.front()->getVectorFactor();
+      unsigned VF2 = Entries.back()->getVectorFactor();
+      if ((VF == VF1 && GatheredScalars.size() != VF1) ||
+          (VF == VF2 && GatheredScalars.size() != VF2))
+        GatheredScalars.append(VF - GatheredScalars.size(),
+                               PoisonValue::get(ScalarTy));
+    }
+    // Remove shuffled elements from list of gathers.
+    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
+      if (Mask[I] != UndefMaskElem)
+        GatheredScalars[I] = PoisonValue::get(ScalarTy);
+    }
+  }
+  if ((ExtractShuffle || GatherShuffle) &&
+      all_of(GatheredScalars, PoisonValue::classof)) {
+    Value *Vec1 = nullptr;
+    if (ExtractShuffle) {
+      // Gather of extractelements can be represented as just a shuffle of
+      // a single/two vectors the scalars are extracted from.
+      // Find input vectors.
+      Value *Vec2 = nullptr;
+      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+        if (ExtractMask[I] == UndefMaskElem ||
+            (!Mask.empty() && Mask[I] != UndefMaskElem)) {
+          ExtractMask[I] = UndefMaskElem;
+          continue;
+        }
+        if (isa<UndefValue>(E->Scalars[I]))
+          continue;
+        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+        if (!Vec1) {
+          Vec1 = EI->getVectorOperand();
+        } else if (Vec1 != EI->getVectorOperand()) {
+          assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+                 "Expected only 1 or 2 vectors shuffle.");
+          Vec2 = EI->getVectorOperand();
+        }
+      }
+      if (Vec2)
+        Vec1 = CreateShuffle(Vec1, Vec2, ExtractMask);
+      else if (Vec1)
+        Vec1 = CreateShuffle(Vec1, Vec1, ExtractMask);
+    }
+    if (GatherShuffle) {
+      Vec = CreateShuffle(Entries.front()->VectorizedValue,
+                          Entries.back()->VectorizedValue, Mask);
+      if (Vec1) {
+        // Build final mask.
+        for (auto [I, Idx] : enumerate(Mask)) {
+          if (ExtractMask[I] != UndefMaskElem)
+            Idx = I;
+          else if (Idx != UndefMaskElem)
+            Idx = I + VF;
+        }
+        Vec = CreateShuffle(Vec1, Vec, Mask);
+      }
+    } else {
+      Vec = Vec1;
+    }
   } else {
     Vec = gather(E->Scalars);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -53,55 +53,3 @@
   %tmp22 = add i32 %tmp21, %tmp19
   ret i32 %tmp22
 }
-
-@data = global [6 x [258 x i8]] zeroinitializer, align 1
-define void @gather_load(ptr noalias %ptr) {
-; CHECK-LABEL: @gather_load(
-; CHECK-NEXT:    [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 4
-; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0), align 1
-; CHECK-NEXT:    [[CONV150:%.*]] = zext i8 [[L0]] to i16
-; CHECK-NEXT:    [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 2, i64 1), align 1
-; CHECK-NEXT:    [[CONV156:%.*]] = zext i8 [[L1]] to i16
-; CHECK-NEXT:    [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 3, i64 2), align 1
-; CHECK-NEXT:    [[CONV162:%.*]] = zext i8 [[L2]] to i16
-; CHECK-NEXT:    [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 4, i64 3), align 1
-; CHECK-NEXT:    [[CONV168:%.*]] = zext i8 [[L3]] to i16
-; CHECK-NEXT:    [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
-; CHECK-NEXT:    store i16 [[ADD152]], ptr [[ARRAYIDX182]], align 2
-; CHECK-NEXT:    store i16 [[ADD158]], ptr [[ARRAYIDX183]], align 2
-; CHECK-NEXT:    store i16 [[ADD164]], ptr [[ARRAYIDX184]], align 2
-; CHECK-NEXT:    store i16 [[ADD170]], ptr [[ARRAYIDX185]], align 2
-; CHECK-NEXT:    ret void
-;
-  %arrayidx182 = getelementptr inbounds i16, ptr %ptr, i64 1
-  %arrayidx183 = getelementptr inbounds i16, ptr %ptr, i64 2
-  %arrayidx184 = getelementptr inbounds i16, ptr %ptr, i64 3
-  %arrayidx185 = getelementptr inbounds i16, ptr %ptr, i64 4
-  %arrayidx149 = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0
-  %l0 = load i8, ptr %arrayidx149, align 1
-  %conv150 = zext i8 %l0 to i16
-  %add152 = add i16 10, %conv150
-  %arrayidx155 = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 2, i64 1
-  %l1 = load i8, ptr %arrayidx155, align 1
-  %conv156 = zext i8 %l1 to i16
-  %add158 = add i16 20, %conv156
-  %arrayidx161 = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 3, i64 2
-  %l2 = load i8, ptr %arrayidx161, align 1
-  %conv162 = zext i8 %l2 to i16
-  %add164 = add i16 30, %conv162
-  %arrayidx167 = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 4, i64 3
-  %l3 = load i8, ptr %arrayidx167, align 1
-  %conv168 = zext i8 %l3 to i16
-  %add170 = add i16 40, %conv168
-  store i16 %add152, ptr %arrayidx182, align 2
-  store i16 %add158, ptr %arrayidx183, align 2
-  store i16 %add164, ptr %arrayidx184, align 2
-  store i16 %add170, ptr %arrayidx185, align 2
-  ret void
-}
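The effect of the new tryToGatherExtractElements/isFixedVectorShuffle path is easiest to see on a tiny example. The IR below is a hand-written sketch for illustration only (it is not one of the tests in this patch, and the function names are invented): once every non-poison scalar of a gather is an extractelement with a constant index from at most two fixed-width source vectors, the gather can be costed and emitted as a single shufflevector instead of an extract/insert chain.

; Gather written scalar by scalar: two extracts re-packed with inserts.
define <2 x i8> @gather_of_extracts(<4 x i8> %v) {
  %e0 = extractelement <4 x i8> %v, i64 0
  %e3 = extractelement <4 x i8> %v, i64 3
  %g0 = insertelement <2 x i8> poison, i8 %e0, i64 0
  %g1 = insertelement <2 x i8> %g0, i8 %e3, i64 1
  ret <2 x i8> %g1
}

; Equivalent form the patch can now produce: one shuffle of the single
; source vector, with the mask taken from the constant extract indices.
define <2 x i8> @gather_as_shuffle(<4 x i8> %v) {
  %s = shufflevector <4 x i8> %v, <4 x i8> poison, <2 x i32> <i32 0, i32 3>
  ret <2 x i8> %s
}

The same idea extends to two source vectors (SK_PermuteTwoSrc), which is why the reduction tests updated below collapse their long insertelement chains into two-operand shuffles.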
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-threshold=-2 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 -slp-min-tree-size=5 | FileCheck %s --check-prefix=FORCE_REDUCTION +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 -slp-min-tree-size=5 | FileCheck %s --check-prefix=FORCE_REDUCTION define void @Test(i32) { ; CHECK-LABEL: @Test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll @@ -110,18 +110,15 @@ define i8 @k(<4 x i8> %x) { ; CHECK-LABEL: @k( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] -; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] -; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -141,18 +138,15 @@ ; CHECK-LABEL: @k_bb( ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2 -; CHECK-NEXT: 
[[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] -; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] -; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 br label %bb1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -110,18 +110,15 @@ define i8 @k(<4 x i8> %x) { ; CHECK-LABEL: @k( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] -; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] -; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 %x3 = extractelement <4 x i8> %x, i32 3 @@ -141,18 +138,15 @@ ; CHECK-LABEL: @k_bb( ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i64 0 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i64 3 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i64 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i64 2 -; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] -; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] -; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] -; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] -; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] +; CHECK-NEXT: 
[[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; %x0 = extractelement <4 x i8> %x, i32 0 br label %bb1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -18,21 +18,21 @@ ; SSE-LABEL: @splat( ; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 ; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> -; SSE-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 -; SSE-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer -; SSE-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @cle, align 16 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer +; SSE-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], ptr @cle, align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( ; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> -; AVX-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 -; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> zeroinitializer -; AVX-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]] -; AVX-NEXT: store <16 x i8> [[TMP4]], ptr @cle, align 16 +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> +; AVX-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]] +; AVX-NEXT: store <16 x i8> [[TMP6]], ptr @cle, align 16 ; AVX-NEXT: ret void ; %1 = xor i8 %c, %a @@ -75,31 +75,29 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) { ; SSE-LABEL: @same_opcode_on_one_side( -; SSE-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]] -; SSE-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]] -; SSE-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]] -; SSE-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]] -; SSE-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]] -; SSE-NEXT: store i32 [[TMP1]], ptr @cle32, align 16 -; SSE-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]] -; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 1), align 4 -; SSE-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]] -; SSE-NEXT: store i32 
[[TMP3]], ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 2), align 4 -; SSE-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]] -; SSE-NEXT: store i32 [[TMP4]], ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 3), align 4 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2 +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; SSE-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], ptr @cle32, align 16 ; SSE-NEXT: ret void ; ; AVX-LABEL: @same_opcode_on_one_side( ; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 -; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; AVX-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2 -; AVX-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[SHUFFLE2]] -; AVX-NEXT: store <4 x i32> [[TMP6]], ptr @cle32, align 16 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer +; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2 +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; AVX-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]] +; AVX-NEXT: store <4 x i32> [[TMP9]], ptr @cle32, align 16 ; AVX-NEXT: ret void ; %add1 = add i32 %c, %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll @@ -15,15 +15,15 @@ define i32 @fn1() { ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @b, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_DSTATE:%.*]], ptr @b, i32 0, i32 1), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @d, align 4 -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr @b, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @d, align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP1]], 0 ; 
CHECK-NEXT: br i1 [[COND]], label [[SW_BB:%.*]], label [[SAVE_STATE_AND_RETURN:%.*]] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @c, align 4 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP3]], 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP2]], 7 ; CHECK-NEXT: store i32 [[AND]], ptr @a, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP0]], <2 x i32> ; CHECK-NEXT: switch i32 [[AND]], label [[IF_END:%.*]] [ ; CHECK-NEXT: i32 7, label [[SAVE_STATE_AND_RETURN]] ; CHECK-NEXT: i32 0, label [[SAVE_STATE_AND_RETURN]] @@ -31,10 +31,8 @@ ; CHECK: if.end: ; CHECK-NEXT: br label [[SAVE_STATE_AND_RETURN]] ; CHECK: save_state_and_return: -; CHECK-NEXT: [[T_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP0]], [[SW_BB]] ], [ [[TMP0]], [[SW_BB]] ] -; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP1]], [[ENTRY]] ], [ 0, [[SW_BB]] ], [ 0, [[SW_BB]] ] -; CHECK-NEXT: store i32 [[T_0]], ptr @b, align 4 -; CHECK-NEXT: store i32 [[F_0]], ptr getelementptr inbounds ([[STRUCT_DSTATE]], ptr @b, i32 0, i32 1), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[IF_END]] ], [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP3]], [[SW_BB]] ], [ [[TMP3]], [[SW_BB]] ] +; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr @b, align 4 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -6,37 +6,35 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[A:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[SHUFFLE15:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE15]]) -; CHECK-NEXT: [[OP_RDX16:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[OP_RDX17:%.*]] = add i32 [[OP_RDX16]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[TMP6]], 0 +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX17]], [[BB1]] ], [ 0, [[BB2:%.*]] ] +; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX15]], [[BB1]] ], [ 0, [[BB2:%.*]] ] ; CHECK-NEXT: ret i32 0 ; CHECK: bb4: 
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE10]] -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 -; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX13]], [[TMP2]] -; CHECK-NEXT: ret i32 [[OP_RDX14]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1 +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX12]], [[TMP3]] +; CHECK-NEXT: ret i32 [[OP_RDX13]] ; CHECK: bb5: ; CHECK-NEXT: br label [[BB4:%.*]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -146,16 +146,14 @@ ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 ; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1 +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> ; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1 +; MINTREESIZE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> ; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x 
float> poison, float [[Q4]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 +; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -180,16 +180,14 @@ ; MINTREESIZE-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 ; MINTREESIZE-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; MINTREESIZE-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 -; MINTREESIZE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[Q0]], i32 0 -; MINTREESIZE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[Q1]], i32 1 +; MINTREESIZE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> ; MINTREESIZE-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; MINTREESIZE-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 -; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q2]], i32 0 -; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q3]], i32 1 +; MINTREESIZE-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> [[RD]], <2 x i32> ; MINTREESIZE-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; MINTREESIZE-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] -; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 -; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 +; MINTREESIZE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 +; MINTREESIZE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -18,41 +18,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1 -; SSE2-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2 -; SSE2-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3 -; SSE2-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 -; SSE2-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2 -; SSE2-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3 -; SSE2-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0 -; SSE2-NEXT: [[TMP1:%.*]] = 
insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2 -; SSE2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT12]], i32 3 -; SSE2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4 -; SSE2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5 -; SSE2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6 -; SSE2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7 -; SSE2-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1 -; SSE2-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2 -; SSE2-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3 -; SSE2-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0 -; SSE2-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1 -; SSE2-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2 -; SSE2-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3 -; SSE2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0 -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1 -; SSE2-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2 -; SSE2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3 -; SSE2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4 -; SSE2-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5 -; SSE2-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6 -; SSE2-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7 -; SSE2-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]]) -; SSE2-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]] +; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; @@ -70,41 +40,11 @@ ; ; AVX-LABEL: @reduce_and4( ; AVX-NEXT: entry: -; AVX-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0 -; AVX-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1 -; AVX-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2 -; AVX-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3 -; AVX-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0 -; AVX-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1 -; AVX-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2 -; AVX-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3 -; AVX-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT8]], i32 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[VECEXT7]], i32 1 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT10]], i32 2 -; AVX-NEXT: [[TMP3:%.*]] = 
-; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 4
-; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT]], i32 5
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT2]], i32 6
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT4]], i32 7
-; AVX-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; AVX-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; AVX-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; AVX-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; AVX-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; AVX-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; AVX-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; AVX-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[VECEXT23]], i32 1
-; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT26]], i32 2
-; AVX-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT28]], i32 3
-; AVX-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT16]], i32 4
-; AVX-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT15]], i32 5
-; AVX-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 6
-; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT20]], i32 7
-; AVX-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP15]])
-; AVX-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP7]])
-; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP16]], [[TMP17]]
+; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
 ; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT: ret i32 [[OP_RDX1]]
 ;
@@ -154,41 +94,11 @@
 define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4_transpose(
-; SSE2-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; SSE2-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; SSE2-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; SSE2-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; SSE2-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; SSE2-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; SSE2-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; SSE2-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; SSE2-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; SSE2-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; SSE2-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; SSE2-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; SSE2-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; SSE2-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; SSE2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; SSE2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1
-; SSE2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2
-; SSE2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3
-; SSE2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4
-; SSE2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5
-; SSE2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6
-; SSE2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7
-; SSE2-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; SSE2-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; SSE2-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0
-; SSE2-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1
-; SSE2-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2
-; SSE2-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3
-; SSE2-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4
-; SSE2-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5
-; SSE2-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6
-; SSE2-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7
-; SSE2-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]])
-; SSE2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]]
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
 ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT: ret i32 [[OP_RDX1]]
 ;
@@ -204,41 +114,11 @@
 ; SSE42-NEXT: ret i32 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
-; AVX-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
-; AVX-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
-; AVX-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V3:%.*]], i64 0
-; AVX-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V4:%.*]], i64 0
-; AVX-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
-; AVX-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
-; AVX-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V3]], i64 1
-; AVX-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V4]], i64 1
-; AVX-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
-; AVX-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
-; AVX-NEXT: [[VECEXT18:%.*]] = extractelement <4 x i32> [[V3]], i64 2
-; AVX-NEXT: [[VECEXT20:%.*]] = extractelement <4 x i32> [[V4]], i64 2
-; AVX-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
-; AVX-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT24]], i32 0
-; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[VECEXT16]], i32 1
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[VECEXT8]], i32 2
-; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[VECEXT1]], i32 3
-; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[VECEXT23]], i32 4
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[VECEXT15]], i32 5
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[VECEXT7]], i32 6
-; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VECEXT]], i32 7
-; AVX-NEXT: [[VECEXT26:%.*]] = extractelement <4 x i32> [[V3]], i64 3
-; AVX-NEXT: [[VECEXT28:%.*]] = extractelement <4 x i32> [[V4]], i64 3
-; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> poison, i32 [[VECEXT28]], i32 0
-; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[VECEXT20]], i32 1
-; AVX-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[VECEXT12]], i32 2
-; AVX-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[VECEXT4]], i32 3
-; AVX-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[VECEXT26]], i32 4
-; AVX-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[VECEXT18]], i32 5
-; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[VECEXT10]], i32 6
-; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[VECEXT2]], i32 7
-; AVX-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP16]])
-; AVX-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]])
-; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
 ; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT: ret i32 [[OP_RDX1]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
@@ -18,7 +18,7 @@
 ; YAML-NEXT: Function: fextr
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-20'
+; YAML-NEXT: - Cost: '-19'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '4'
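The updated checks above all encode the same pattern; as a standalone sketch of it (the function below is an illustrative reduction written for this note, not part of the patch or the test suite): a gather of constant-index extractelements from a fixed-width vector, previously rebuilt lane by lane with insertelement, is now expected to be matched as a single shufflevector of the source vector.

; Illustrative only; hypothetical function, not taken from the patch.
define <2 x float> @gather_as_shuffle(<4 x float> %rd) {
  ; Scalarized gather form emitted before this change:
  ; extract each lane, then reinsert it into a new vector.
  %q0 = extractelement <4 x float> %rd, i32 0
  %q1 = extractelement <4 x float> %rd, i32 1
  %old0 = insertelement <2 x float> poison, float %q0, i32 0
  %old1 = insertelement <2 x float> %old0, float %q1, i32 1
  ; Single-shuffle form the updated checks expect: lanes 0 and 1
  ; of %rd selected directly by the mask.
  %new = shufflevector <4 x float> %rd, <4 x float> poison, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %new
}

With two distinct source vectors the same matching yields a two-operand shuffle, as in the reduction-transpose.ll checks above, and removing the scalar extract/insert chains is what shifts the reported vectorization cost in remark_extract_broadcast.ll from -20 to -19.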