diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -544,110 +544,127 @@ /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was /// successful, the matched scalars are replaced by poison values in \p VL for /// future analysis. -static std::optional +static SmallVector> tryToGatherExtractElements(SmallVectorImpl &VL, - SmallVectorImpl &Mask) { - // Scan list of gathered scalars for extractelements that can be represented - // as shuffles. - MapVector> VectorOpToIdx; - SmallVector UndefVectorExtracts; - for (int I = 0, E = VL.size(); I < E; ++I) { - auto *EI = dyn_cast(VL[I]); - if (!EI) { - if (isa(VL[I])) + SmallVectorImpl &Mask, unsigned NumParts) { + assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); + SmallVector> ShufflesRes(NumParts); + Mask.assign(VL.size(), PoisonMaskElem); + unsigned SliceSize = VL.size() / NumParts; + bool AtLeastOneFound = false; + for (unsigned Part = 0; Part < NumParts; ++Part) { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MutableArrayRef SubVL = + MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + MapVector> VectorOpToIdx; + SmallVector UndefVectorExtracts; + for (int I = 0, E = SubVL.size(); I < E; ++I) { + auto *EI = dyn_cast(SubVL[I]); + if (!EI) { + if (isa(SubVL[I])) + UndefVectorExtracts.push_back(I); + continue; + } + auto *VecTy = dyn_cast(EI->getVectorOperandType()); + if (!VecTy || !isa(EI->getIndexOperand())) + continue; + std::optional Idx = getExtractIndex(EI); + // Undefined index. 
+ if (!Idx) { UndefVectorExtracts.push_back(I); - continue; + continue; + } + SmallBitVector ExtractMask(VecTy->getNumElements(), true); + ExtractMask.reset(*Idx); + if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { + UndefVectorExtracts.push_back(I); + continue; + } + VectorOpToIdx[EI->getVectorOperand()].push_back(I); + } + // Sort the vector operands by the maximum number of uses in + // extractelements. + MapVector> VFToVector; + for (const auto &Data : VectorOpToIdx) + VFToVector[cast(Data.first->getType())->getNumElements()] + .push_back(Data.first); + for (auto &Data : VFToVector) { + stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { + return VectorOpToIdx.find(V1)->second.size() > + VectorOpToIdx.find(V2)->second.size(); + }); } - auto *VecTy = dyn_cast(EI->getVectorOperandType()); - if (!VecTy || !isa(EI->getIndexOperand())) - continue; - std::optional Idx = getExtractIndex(EI); - // Undefined index. - if (!Idx) { - UndefVectorExtracts.push_back(I); + // Find the best pair of the vectors with the same number of elements or a + // single vector. + const int UndefSz = UndefVectorExtracts.size(); + unsigned SingleMax = 0; + Value *SingleVec = nullptr; + unsigned PairMax = 0; + std::pair PairVec(nullptr, nullptr); + for (auto &Data : VFToVector) { + Value *V1 = Data.second.front(); + if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { + SingleMax = VectorOpToIdx[V1].size() + UndefSz; + SingleVec = V1; + } + Value *V2 = nullptr; + if (Data.second.size() > 1) + V2 = *std::next(Data.second.begin()); + if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + + UndefSz) { + PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; + PairVec = std::make_pair(V1, V2); + } + } + if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) continue; + // Check if better to perform a shuffle of 2 vectors or just of a single + // vector. 
+ SmallVector SavedVL(SubVL.begin(), SubVL.end()); + SmallVector GatheredExtracts( + SubVL.size(), PoisonValue::get(SubVL.front()->getType())); + if (SingleMax >= PairMax && SingleMax) { + for (int Idx : VectorOpToIdx[SingleVec]) + std::swap(GatheredExtracts[Idx], SubVL[Idx]); + } else { + for (Value *V : {PairVec.first, PairVec.second}) + for (int Idx : VectorOpToIdx[V]) + std::swap(GatheredExtracts[Idx], SubVL[Idx]); } - SmallBitVector ExtractMask(VecTy->getNumElements(), true); - ExtractMask.reset(*Idx); - if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) { - UndefVectorExtracts.push_back(I); + // Add extracts from undefs too. + for (int Idx : UndefVectorExtracts) + std::swap(GatheredExtracts[Idx], SubVL[Idx]); + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + SmallVector SubMask; + std::optional Res = + isFixedVectorShuffle(GatheredExtracts, SubMask); + if (!Res) { + // TODO: try to check other subsets if possible. + // Restore the original VL if attempt was not successful. + copy(SavedVL, SubVL.begin()); continue; } - VectorOpToIdx[EI->getVectorOperand()].push_back(I); - } - // Sort the vector operands by the maximum number of uses in extractelements. - MapVector> VFToVector; - for (const auto &Data : VectorOpToIdx) - VFToVector[cast(Data.first->getType())->getNumElements()] - .push_back(Data.first); - for (auto &Data : VFToVector) { - stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) { - return VectorOpToIdx.find(V1)->second.size() > - VectorOpToIdx.find(V2)->second.size(); - }); - } - // Find the best pair of the vectors with the same number of elements or a - // single vector. 
- const int UndefSz = UndefVectorExtracts.size(); - unsigned SingleMax = 0; - Value *SingleVec = nullptr; - unsigned PairMax = 0; - std::pair PairVec(nullptr, nullptr); - for (auto &Data : VFToVector) { - Value *V1 = Data.second.front(); - if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) { - SingleMax = VectorOpToIdx[V1].size() + UndefSz; - SingleVec = V1; - } - Value *V2 = nullptr; - if (Data.second.size() > 1) - V2 = *std::next(Data.second.begin()); - if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + - UndefSz) { - PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz; - PairVec = std::make_pair(V1, V2); - } - } - if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) - return std::nullopt; - // Check if better to perform a shuffle of 2 vectors or just of a single - // vector. - SmallVector SavedVL(VL.begin(), VL.end()); - SmallVector GatheredExtracts( - VL.size(), PoisonValue::get(VL.front()->getType())); - if (SingleMax >= PairMax && SingleMax) { - for (int Idx : VectorOpToIdx[SingleVec]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } else { - for (Value *V : {PairVec.first, PairVec.second}) - for (int Idx : VectorOpToIdx[V]) - std::swap(GatheredExtracts[Idx], VL[Idx]); - } - // Add extracts from undefs too. - for (int Idx : UndefVectorExtracts) - std::swap(GatheredExtracts[Idx], VL[Idx]); - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - std::optional Res = - isFixedVectorShuffle(GatheredExtracts, Mask); - if (!Res) { - // TODO: try to check other subsets if possible. - // Restore the original VL if attempt was not successful. - VL.swap(SavedVL); - return std::nullopt; - } - // Restore unused scalars from mask, if some of the extractelements were not - // selected for shuffle. 
- for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { - auto *EI = dyn_cast(VL[I]); - if (!EI || !isa(EI->getVectorOperandType()) || - !isa(EI->getIndexOperand()) || - is_contained(UndefVectorExtracts, I)) - continue; - if (Mask[I] == PoisonMaskElem && !isa(GatheredExtracts[I])) - std::swap(VL[I], GatheredExtracts[I]); + // Restore unused scalars from mask, if some of the extractelements were not + // selected for shuffle. + for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { + auto *EI = dyn_cast(SubVL[I]); + if (!EI || !isa(EI->getVectorOperandType()) || + !isa(EI->getIndexOperand()) || + is_contained(UndefVectorExtracts, I)) + continue; + if (SubMask[I] == PoisonMaskElem && + !isa(GatheredExtracts[I])) + std::swap(SubVL[I], GatheredExtracts[I]); + } + ShufflesRes[Part] = Res; + copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + AtLeastOneFound = true; } - return Res; + if (!AtLeastOneFound) + ShufflesRes.clear(); + return ShufflesRes; } namespace { @@ -6938,64 +6955,58 @@ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector of type \p VecTy containing the - /// extracted values from \p VL. - InstructionCost computeExtractCost(ArrayRef VL, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || - !NumOfParts || VecTy->getNumElements() < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - - bool AllConsecutive = true; - unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; - unsigned Idx = -1; + /// Compute the cost of creating a vector containing the extracted values from + /// \p VL. 
+ InstructionCost + computeExtractCost(ArrayRef VL, ArrayRef Mask, + ArrayRef> ShuffleKinds, + unsigned NumParts) { + assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); + unsigned EltsPerVector = VL.size() / NumParts; InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - SmallVector RegMask(EltsPerVector, PoisonMaskElem); - for (auto *V : VL) { - ++Idx; - - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, PoisonMaskElem); - AllConsecutive = true; + for (unsigned Part = 0; Part < NumParts; ++Part) { + if (!ShuffleKinds[Part]) continue; - } - - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == PoisonMaskElem) + ArrayRef SubMask = Mask.slice(Part * EltsPerVector, EltsPerVector); + if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc) { + Cost += TTI.getShuffleCost( + *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + SubMask); continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast(V)); - if (!isa(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) { - unsigned PrevIdx = *getExtractIndex(cast(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; } + bool AllConsecutive = true; + SmallVector RegMask(EltsPerVector, PoisonMaskElem); + ArrayRef SubVL = VL.slice(Part * EltsPerVector, EltsPerVector); + for (auto [I, V] : enumerate(SubVL)) { + // Need to exclude undefs from analysis. + if (isa(V) || SubMask[I] == PoisonMaskElem) + continue; - if (AllConsecutive) - continue; - - // Skip all indices, except for the last index per vector block. 
- if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) - continue; + // Check all extracts for a vector register on the target directly + // extract values in order. + unsigned CurrentIdx = *getExtractIndex(cast(V)); + if (I > 0 && !isa(SubVL[I - 1]) && + SubMask[I - 1] != PoisonMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast(SubVL[I - 1])); + AllConsecutive &= + PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == I; + RegMask[I] = CurrentIdx % EltsPerVector; + } + } // If we have a series of extracts which are not consecutive and hence // cannot re-use the source vector register directly, compute the shuffle // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector), - RegMask); + if (!AllConsecutive) + Cost += TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + RegMask); } return Cost; } @@ -7101,90 +7112,61 @@ SmallPtrSetImpl &CheckedExtracts) : TTI(TTI), VectorizedVals(VectorizedVals), R(R), CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + ArrayRef> ShuffleKinds, + unsigned NumParts) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef VL = E->Scalars; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) + if (NumParts == VL.size()) return nullptr; - DenseMap ExtractVectorsTys; - for (auto [I, V] : enumerate(VL)) { - // Ignore non-extractelement scalars. 
- if (isa(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. - const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast(V), VectorizedVals) || - (VE && VE != E)) - continue; - auto *EE = cast(V); - VecBase = EE->getVectorOperand(); - std::optional EEIdx = getExtractIndex(EE); - if (!EEIdx) - continue; - unsigned Idx = *EEIdx; - if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min(It->second, Idx); - } - // Take credit for instruction that will become dead. - if (EE->hasOneUse()) { - Instruction *Ext = EE->user_back(); - if (isa(Ext) && all_of(Ext->users(), [](User *U) { - return isa(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + // Ignore non-extractelement scalars. 
+ if (isa(V) || + (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) continue; - } - } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); - } - // Add a cost for subvector extracts/inserts if required. - for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx % NumElts == 0) + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. + const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast(V), VectorizedVals) || + (VE && VE != E)) continue; - if (Idx + NumElts <= EENumElts) { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + auto *EE = cast(V); + VecBase = EE->getVectorOperand(); + std::optional EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + // Take credit for instruction that will become dead. 
+ if (EE->hasOneUse()) { + Instruction *Ext = EE->user_back(); + if (isa(Ext) && all_of(Ext->users(), [](User *U) { + return isa(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } } - } else { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), + CostKind, Idx); } } // Check that gather of extractelements can be represented as just a @@ -7192,7 +7174,7 @@ // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - Cost += computeExtractCost(VL, Mask, ShuffleKind); + Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); return VecBase; } void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { @@ -7271,7 +7253,7 @@ assert((IsFinalized || CommonMask.empty()) && "Shuffle construction must be finalized."); } -}; + }; InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, @@ -7316,30 +7298,37 @@ reorderScalars(GatheredScalars, ReorderMask); SmallVector Mask; SmallVector ExtractMask; - std::optional ExtractShuffle; std::optional GatherShuffle; SmallVector Entries; Type *ScalarTy = GatheredScalars.front()->getType(); // Check for gathered extracts. 
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); + unsigned NumParts = TTI->getNumberOfParts(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size())); + if (NumParts == 0) + NumParts = 1; + SmallVector> ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); SmallVector IgnoredVals; if (UserIgnoreList) IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + if (!ExtractShuffles.empty()) { + if (Value *VecBase = Estimator.adjustExtracts(E, ExtractMask, + ExtractShuffles, NumParts)) + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -7410,7 +7399,7 @@ std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } - if (ExtractShuffle) + if (!ExtractShuffles.empty()) Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } @@ -9455,7 +9444,10 @@ : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. 
- Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; + SmallPtrSet UniqueBases; Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -9463,6 +9455,7 @@ continue; auto *EI = cast(E->Scalars[I]); VecBase = EI->getVectorOperand(); + UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -9471,7 +9464,71 @@ continue; R.eraseInstruction(EI); } - return VecBase; + if (NumParts == 1 || UniqueBases.size() == 1) + return VecBase; + UseVecBaseAsInput = true; + auto TransformToIdentity = [](MutableArrayRef Mask) { + for (auto [I, Idx] : enumerate(Mask)) + if (Idx != PoisonMaskElem) + Idx = I; + }; + Value *Vec = nullptr; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + unsigned SliceSize = E->Scalars.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef VL = + ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); + MutableArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + constexpr int MaxBases = 2; + SmallVector Bases(MaxBases); +#ifndef NDEBUG + int PrevSize = 0; +#endif // NDEBUG + for (const auto [I, V]: enumerate(VL)) { + if (SubMask[I] == PoisonMaskElem) + continue; + Value *VecOp = cast(V)->getVectorOperand(); + const int Size = + cast(VecOp->getType())->getNumElements(); +#ifndef NDEBUG + assert((PrevSize == Size || PrevSize == 0) && + "Expected vectors of the same size."); + PrevSize = Size; +#endif // NDEBUG + Bases[SubMask[I] < Size ? 
0 : 1] = VecOp; + } + if (!Bases.front()) + continue; + Value *SubVec; + if (Bases.back()) { + SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); + TransformToIdentity(SubMask); + } else { + SubVec = Bases.front(); + } + if (!Vec) { + Vec = SubVec; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + } else { + unsigned VF = cast(Vec->getType())->getNumElements(); + if (Vec->getType() != SubVec->getType()) { + unsigned SubVecVF = + cast(SubVec->getType())->getNumElements(); + if (VF < SubVecVF) + TransformToIdentity(VecMask); + VF = std::max(VF, SubVecVF); + } + // Adjust SubMask. + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx != PoisonMaskElem) + Idx += VF; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + Vec = createShuffle(Vec, SubVec, VecMask); + TransformToIdentity(VecMask); + } + } + copy(VecMask, Mask.begin()); + return Vec; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. @@ -9796,26 +9853,39 @@ ResTy Res = ResTy(); SmallVector Mask; SmallVector ExtractMask; - std::optional ExtractShuffle; + SmallVector> ExtractShuffles; + Value *ExtractVecBase = nullptr; + bool UseVecBaseAsInput = false; std::optional GatherShuffle; SmallVector Entries; Type *ScalarTy = GatheredScalars.front()->getType(); if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. 
- ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); SmallVector IgnoredVals; if (UserIgnoreList) IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + unsigned NumParts = TTI->getNumberOfParts(FixedVectorType::get( + GatheredScalars.front()->getType(), GatheredScalars.size())); + if (NumParts == 0) + NumParts = 1; + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, NumParts, UseVecBaseAsInput)) { + ExtractVecBase = VecBase; + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } + } // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -9963,30 +10033,35 @@ } } }; - if (ExtractShuffle || GatherShuffle) { + if (!ExtractShuffles.empty() || GatherShuffle) { bool IsNonPoisoned = true; bool IsUsedInExpr = false; Value *Vec1 = nullptr; - if (ExtractShuffle) { + if (!ExtractShuffles.empty()) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. 
Value *Vec2 = nullptr; for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem || - (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + if (!Mask.empty() && Mask[I] != PoisonMaskElem) ExtractMask[I] = PoisonMaskElem; - continue; - } - if (isa(E->Scalars[I])) - continue; - auto *EI = cast(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); + } + if (UseVecBaseAsInput) { + Vec1 = ExtractVecBase; + } else { + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem) + continue; + if (isa(E->Scalars[I])) + continue; + auto *EI = cast(E->Scalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } } } if (Vec2) { @@ -10025,10 +10100,14 @@ int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. 
- bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = ExtractShuffles.empty() || !GatherShuffle; bool IsIdentityShuffle = - (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ((UseVecBaseAsInput || + all_of(ExtractShuffles, + [](const std::optional &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + })) && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask)) || (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -75,64 +75,47 @@ ; CHECK-NEXT: [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]] -; CHECK-NEXT: [[CONV48:%.*]] = trunc i64 [[ADD]] to i32 -; CHECK-NEXT: [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]] -; CHECK-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32 -; CHECK-NEXT: [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE67:%.*]] = 
extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], [[VGETQ_LANE67]] -; CHECK-NEXT: [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32 -; CHECK-NEXT: [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]] -; CHECK-NEXT: [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> ; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127 ; CHECK-NEXT: [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31 ; CHECK-NEXT: br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]] ; CHECK: while.body88: ; CHECK-NEXT: [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ] ; CHECK-NEXT: [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ] -; 
CHECK-NEXT: [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP34:%.*]], [[WHILE_END121]] ], [ [[TMP17]], [[WHILE_END]] ] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY93:%.*]] ; CHECK: while.body93: -; CHECK-NEXT: [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP19]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP20]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP18]], [[WHILE_BODY88]] ], [ [[TMP34]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[AND94:%.*]] = and i32 [[A_0279]], 1 ; CHECK-NEXT: [[AND95:%.*]] = and i32 [[B_0278]], 1 ; 
CHECK-NEXT: [[SHR96]] = lshr i32 [[A_0279]], 1 ; CHECK-NEXT: [[SHR97]] = lshr i32 [[B_0278]], 1 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0 -; CHECK-NEXT: [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false -; CHECK-NEXT: [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32 -; CHECK-NEXT: [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]] -; CHECK-NEXT: [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0 -; CHECK-NEXT: [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false -; CHECK-NEXT: [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32 -; CHECK-NEXT: [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false -; CHECK-NEXT: [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32 -; CHECK-NEXT: [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false -; CHECK-NEXT: [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32 -; CHECK-NEXT: [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 
x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> +; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]] @@ -144,61 +127,53 @@ ; CHECK-NEXT: br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]] ; CHECK: while.end122: ; CHECK-NEXT: [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP17]], [[WHILE_END]] ], [ [[TMP34]], [[WHILE_END121]] ] ; CHECK-NEXT: [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]] ; CHECK: while.body132.preheader: -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 ; CHECK-NEXT: [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]] -; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 
[[TMP16]], [[SUB125]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]] +; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP36]], [[SUB125]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP37]], [[SUB125]] ; CHECK-NEXT: br label [[WHILE_BODY132:%.*]] ; CHECK: while.body132: -; CHECK-NEXT: [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP51:%.*]], [[WHILE_BODY132]] ], [ [[TMP35]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[AND133:%.*]] = and i32 [[A_1301]], 1 ; CHECK-NEXT: [[AND134:%.*]] = and i32 [[B_1300]], 1 ; CHECK-NEXT: [[SHR135]] = lshr i32 [[A_1301]], 1 ; CHECK-NEXT: [[SHR136]] = lshr i32 [[B_1300]], 1 -; CHECK-NEXT: [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0 -; CHECK-NEXT: [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false -; CHECK-NEXT: [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32 -; CHECK-NEXT: [[ADD142]] = add i32 [[_CTT_2306]], 
[[LAND_EXT141]] -; CHECK-NEXT: [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0 -; CHECK-NEXT: [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false -; CHECK-NEXT: [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32 -; CHECK-NEXT: [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false -; CHECK-NEXT: [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32 -; CHECK-NEXT: [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false -; CHECK-NEXT: [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] to i32 -; CHECK-NEXT: [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]] +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] ; CHECK-NEXT: [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1 ; CHECK-NEXT: [[CMP130_NOT:%.*]] = icmp eq 
i32 [[DEC165]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: -; CHECK-NEXT: [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ] -; CHECK-NEXT: store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 +; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 +; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 +; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0 +; CHECK-NEXT: store i32 [[TMP56]], ptr [[CFT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -168,23 +168,23 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> 
[[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[R031]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[R031]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: 
[[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> @@ -241,20 +241,24 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd 
<4 x float> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R071]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> @@ -298,11 +302,31 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = 
shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -324,11 +348,31 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> 
[[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -368,19 +412,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[RV151]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = 
add <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SLM-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <16 x i16> [[TMP3]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -168,23 +168,23 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: 
[[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[R031]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[R031]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( ; AVX-NEXT: [[TMP1:%.*]] = 
shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> @@ -241,20 +241,24 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R071]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> 
[[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> @@ -298,11 +302,31 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x 
i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -324,11 +348,31 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], 
[[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -368,19 +412,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[RV151]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SLM-NEXT: 
[[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <16 x i16> [[TMP3]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -147,23 +147,23 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[R031]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> 
[[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[R031]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> @@ -192,20 +192,24 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x 
float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R071]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( ; 
AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> @@ -249,11 +253,31 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x 
i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -275,11 +299,31 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> 
[[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -319,19 +363,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[RV151]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <16 x i16> [[TMP3]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], 
<16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: @test_v16i16( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -147,23 +147,23 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { ; SSE-LABEL: @test_v4f64( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SSE-NEXT: ret <4 x double> [[R031]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> 
[[TMP7]] ; ; SLM-LABEL: @test_v4f64( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> -; SLM-NEXT: ret <4 x double> [[R031]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] ; ; AVX-LABEL: @test_v4f64( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> @@ -192,20 +192,24 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { ; SSE-LABEL: @test_v8f32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <8 x float> [[TMP3]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> 
[[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] ; ; SLM-LABEL: @test_v8f32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R071]] +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] ; ; AVX-LABEL: @test_v8f32( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> @@ -249,11 +253,31 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> 
-; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -275,11 +299,31 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = 
shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -319,19 +363,23 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> 
[[B:%.*]], <8 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> -; SSE-NEXT: ret <16 x i16> [[RV151]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] ; ; SLM-LABEL: @test_v16i16( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; SLM-NEXT: ret <16 x i16> [[TMP3]] +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] ; ; AVX-LABEL: 
@test_v16i16( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -28,15 +28,13 @@ ; ; SSE42-LABEL: @reduce_and4( ; SSE42-NEXT: entry: -; SSE42-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]]) -; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]]) -; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0]], [[TMP1]] -; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]]) -; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP2]] -; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]]) -; SSE42-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP3]] -; SSE42-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]] -; SSE42-NEXT: ret i32 [[OP_RDX3]] +; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]] +; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] +; SSE42-NEXT: ret i32 [[OP_RDX1]] ; ; AVX-LABEL: @reduce_and4( ; AVX-NEXT: entry: @@ -103,15 +101,13 @@ ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; ; SSE42-LABEL: @reduce_and4_transpose( -; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]]) -; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]]) -; SSE42-NEXT: [[OP_RDX:%.*]] = 
and i32 [[TMP1]], [[TMP2]] -; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]]) -; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]] -; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]]) -; SSE42-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP4]] -; SSE42-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]] -; SSE42-NEXT: ret i32 [[OP_RDX3]] +; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) +; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]] +; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] +; SSE42-NEXT: ret i32 [[OP_RDX1]] ; ; AVX-LABEL: @reduce_and4_transpose( ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32>