diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -544,110 +544,127 @@
 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
 /// successful, the matched scalars are replaced by poison values in \p VL for
 /// future analysis.
-static std::optional<TTI::ShuffleKind>
+static SmallVector<std::optional<TTI::ShuffleKind>>
 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
-                           SmallVectorImpl<int> &Mask) {
-  // Scan list of gathered scalars for extractelements that can be represented
-  // as shuffles.
-  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
-  SmallVector<int> UndefVectorExtracts;
-  for (int I = 0, E = VL.size(); I < E; ++I) {
-    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
-    if (!EI) {
-      if (isa<UndefValue>(VL[I]))
+                           SmallVectorImpl<int> &Mask, unsigned NumParts) {
+  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
+  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
+  Mask.assign(VL.size(), PoisonMaskElem);
+  unsigned SliceSize = VL.size() / NumParts;
+  bool AtLeastOneFound = false;
+  for (unsigned Part = 0; Part < NumParts; ++Part) {
+    // Scan list of gathered scalars for extractelements that can be
+    // represented as shuffles.
+    MutableArrayRef<Value *> SubVL =
+        MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+    MapVector<Value *, SmallVector<int>> VectorOpToIdx;
+    SmallVector<int> UndefVectorExtracts;
+    for (int I = 0, E = SubVL.size(); I < E; ++I) {
+      auto *EI = dyn_cast<ExtractElementInst>(SubVL[I]);
+      if (!EI) {
+        if (isa<UndefValue>(SubVL[I]))
+          UndefVectorExtracts.push_back(I);
+        continue;
+      }
+      auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+      if (!VecTy || !isa<ConstantInt>(EI->getIndexOperand()))
+        continue;
+      std::optional<unsigned> Idx = getExtractIndex(EI);
+      // Undefined index.
+      if (!Idx) {
         UndefVectorExtracts.push_back(I);
-        continue;
+        continue;
+      }
+      SmallBitVector ExtractMask(VecTy->getNumElements(), true);
+      ExtractMask.reset(*Idx);
+      if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+        UndefVectorExtracts.push_back(I);
+        continue;
+      }
+      VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+    }
+    // Sort the vector operands by the maximum number of uses in
+    // extractelements.
+    MapVector<unsigned, SmallVector<Value *>> VFToVector;
+    for (const auto &Data : VectorOpToIdx)
+      VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
+          .push_back(Data.first);
+    for (auto &Data : VFToVector) {
+      stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
+        return VectorOpToIdx.find(V1)->second.size() >
+               VectorOpToIdx.find(V2)->second.size();
+      });
     }
-    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
-    if (!VecTy || !isa<ConstantInt>(EI->getIndexOperand()))
-      continue;
-    std::optional<unsigned> Idx = getExtractIndex(EI);
-    // Undefined index.
-    if (!Idx) {
-      UndefVectorExtracts.push_back(I);
+    // Find the best pair of the vectors with the same number of elements or a
+    // single vector.
+    const int UndefSz = UndefVectorExtracts.size();
+    unsigned SingleMax = 0;
+    Value *SingleVec = nullptr;
+    unsigned PairMax = 0;
+    std::pair<Value *, Value *> PairVec(nullptr, nullptr);
+    for (auto &Data : VFToVector) {
+      Value *V1 = Data.second.front();
+      if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
+        SingleMax = VectorOpToIdx[V1].size() + UndefSz;
+        SingleVec = V1;
+      }
+      Value *V2 = nullptr;
+      if (Data.second.size() > 1)
+        V2 = *std::next(Data.second.begin());
+      if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
+                              UndefSz) {
+        PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
+        PairVec = std::make_pair(V1, V2);
+      }
+    }
+    if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
+      continue;
+    // Check if better to perform a shuffle of 2 vectors or just of a single
+    // vector.
+    SmallVector<Value *> SavedVL(SubVL.begin(), SubVL.end());
+    SmallVector<Value *> GatheredExtracts(
+        SubVL.size(), PoisonValue::get(SubVL.front()->getType()));
+    if (SingleMax >= PairMax && SingleMax) {
+      for (int Idx : VectorOpToIdx[SingleVec])
+        std::swap(GatheredExtracts[Idx], SubVL[Idx]);
+    } else {
+      for (Value *V : {PairVec.first, PairVec.second})
+        for (int Idx : VectorOpToIdx[V])
+          std::swap(GatheredExtracts[Idx], SubVL[Idx]);
    }
-    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
-    ExtractMask.reset(*Idx);
-    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
-      UndefVectorExtracts.push_back(I);
+    // Add extracts from undefs too.
+    for (int Idx : UndefVectorExtracts)
+      std::swap(GatheredExtracts[Idx], SubVL[Idx]);
+    // Check that gather of extractelements can be represented as just a
+    // shuffle of a single/two vectors the scalars are extracted from.
+    SmallVector<int> SubMask;
+    std::optional<TTI::ShuffleKind> Res =
+        isFixedVectorShuffle(GatheredExtracts, SubMask);
+    if (!Res) {
+      // TODO: try to check other subsets if possible.
+      // Restore the original VL if attempt was not successful.
+      copy(SavedVL, SubVL.begin());
      continue;
    }
-    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
-  }
-  // Sort the vector operands by the maximum number of uses in extractelements.
-  MapVector<unsigned, SmallVector<Value *>> VFToVector;
-  for (const auto &Data : VectorOpToIdx)
-    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
-        .push_back(Data.first);
-  for (auto &Data : VFToVector) {
-    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
-      return VectorOpToIdx.find(V1)->second.size() >
-             VectorOpToIdx.find(V2)->second.size();
-    });
-  }
-  // Find the best pair of the vectors with the same number of elements or a
-  // single vector.
-  const int UndefSz = UndefVectorExtracts.size();
-  unsigned SingleMax = 0;
-  Value *SingleVec = nullptr;
-  unsigned PairMax = 0;
-  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
-  for (auto &Data : VFToVector) {
-    Value *V1 = Data.second.front();
-    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
-      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
-      SingleVec = V1;
-    }
-    Value *V2 = nullptr;
-    if (Data.second.size() > 1)
-      V2 = *std::next(Data.second.begin());
-    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
-                            UndefSz) {
-      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
-      PairVec = std::make_pair(V1, V2);
-    }
-  }
-  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
-    return std::nullopt;
-  // Check if better to perform a shuffle of 2 vectors or just of a single
-  // vector.
-  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
-  SmallVector<Value *> GatheredExtracts(
-      VL.size(), PoisonValue::get(VL.front()->getType()));
-  if (SingleMax >= PairMax && SingleMax) {
-    for (int Idx : VectorOpToIdx[SingleVec])
-      std::swap(GatheredExtracts[Idx], VL[Idx]);
-  } else {
-    for (Value *V : {PairVec.first, PairVec.second})
-      for (int Idx : VectorOpToIdx[V])
-        std::swap(GatheredExtracts[Idx], VL[Idx]);
-  }
-  // Add extracts from undefs too.
-  for (int Idx : UndefVectorExtracts)
-    std::swap(GatheredExtracts[Idx], VL[Idx]);
-  // Check that gather of extractelements can be represented as just a
-  // shuffle of a single/two vectors the scalars are extracted from.
-  std::optional<TTI::ShuffleKind> Res =
-      isFixedVectorShuffle(GatheredExtracts, Mask);
-  if (!Res) {
-    // TODO: try to check other subsets if possible.
-    // Restore the original VL if attempt was not successful.
-    VL.swap(SavedVL);
-    return std::nullopt;
-  }
-  // Restore unused scalars from mask, if some of the extractelements were not
-  // selected for shuffle.
-  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
-    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
-    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
-        !isa<ConstantInt>(EI->getIndexOperand()) ||
-        is_contained(UndefVectorExtracts, I))
-      continue;
-    if (Mask[I] == PoisonMaskElem && !isa<UndefValue>(GatheredExtracts[I]))
-      std::swap(VL[I], GatheredExtracts[I]);
+    // Restore unused scalars from mask, if some of the extractelements were not
+    // selected for shuffle.
+    for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
+      auto *EI = dyn_cast<ExtractElementInst>(SubVL[I]);
+      if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
+          !isa<ConstantInt>(EI->getIndexOperand()) ||
+          is_contained(UndefVectorExtracts, I))
+        continue;
+      if (SubMask[I] == PoisonMaskElem &&
+          !isa<UndefValue>(GatheredExtracts[I]))
+        std::swap(SubVL[I], GatheredExtracts[I]);
+    }
+    ShufflesRes[Part] = Res;
+    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
+    AtLeastOneFound = true;
   }
-  return Res;
+  if (!AtLeastOneFound)
+    ShufflesRes.clear();
+  return ShufflesRes;
 }
 
 namespace {
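The new return shape is the key change here: instead of one optional shuffle kind for the whole gather, the function now reports one optional kind per register-sized part, with `Mask` holding the concatenated per-part sub-masks. A minimal standalone sketch of that per-part classification idea (hypothetical helper names, not the LLVM API):

```cpp
#include <cstdio>
#include <optional>
#include <vector>

enum class Kind { SingleSrc, TwoSrc };
constexpr int PoisonMaskElem = -1;

// Classify one register-sized slice of a gather mask: single-source if all
// live indices fall into the first VF-wide source, two-source otherwise.
std::optional<Kind> classifySlice(const std::vector<int> &Slice, int VF) {
  bool SeenAny = false, SeenSecond = false;
  for (int Idx : Slice) {
    if (Idx == PoisonMaskElem)
      continue;
    SeenAny = true;
    SeenSecond |= Idx >= VF;
  }
  if (!SeenAny)
    return std::nullopt; // Nothing extractable in this slice.
  return SeenSecond ? Kind::TwoSrc : Kind::SingleSrc;
}

int main() {
  // An 8-element gather on a target with 4-element registers: 2 parts.
  std::vector<int> Mask = {0, 4, 1, 5, 2, PoisonMaskElem, 3, PoisonMaskElem};
  const unsigned NumParts = 2, SliceSize = Mask.size() / NumParts;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    std::vector<int> Slice(Mask.begin() + Part * SliceSize,
                           Mask.begin() + (Part + 1) * SliceSize);
    auto K = classifySlice(Slice, /*VF=*/4);
    std::printf("part %u -> %s\n", Part,
                !K ? "no match"
                   : (*K == Kind::TwoSrc ? "two-source shuffle"
                                         : "single-source shuffle"));
  }
  return 0;
}
```

In this toy run the first half needs a two-source shuffle while the second half matches a single source, which is exactly the situation the per-part result vector can now express and the old single-kind return could not.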
@@ -6867,64 +6884,58 @@
           : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
   };
 
-  /// Compute the cost of creating a vector of type \p VecTy containing the
-  /// extracted values from \p VL.
-  InstructionCost computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
-                                     TTI::ShuffleKind ShuffleKind) {
-    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
-    unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
-
-    if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc ||
-        !NumOfParts || VecTy->getNumElements() < NumOfParts)
-      return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
-
-    bool AllConsecutive = true;
-    unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
-    unsigned Idx = -1;
+  /// Compute the cost of creating a vector containing the extracted values
+  /// from \p VL.
+  InstructionCost
+  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+                     unsigned NumParts) {
+    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
+    unsigned EltsPerVector = VL.size() / NumParts;
     InstructionCost Cost = 0;
     // Process extracts in blocks of EltsPerVector to check if the source vector
     // operand can be re-used directly. If not, add the cost of creating a
     // shuffle to extract the values into a vector register.
-    SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
-    for (auto *V : VL) {
-      ++Idx;
-
-      // Reached the start of a new vector registers.
-      if (Idx % EltsPerVector == 0) {
-        RegMask.assign(EltsPerVector, PoisonMaskElem);
-        AllConsecutive = true;
+    for (unsigned Part = 0; Part < NumParts; ++Part) {
+      if (!ShuffleKinds[Part])
         continue;
-      }
-
-      // Need to exclude undefs from analysis.
-      if (isa<UndefValue>(V) || Mask[Idx] == PoisonMaskElem)
+      ArrayRef<int> SubMask = Mask.slice(Part * EltsPerVector, EltsPerVector);
+      if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc) {
+        Cost += TTI.getShuffleCost(
+            *ShuffleKinds[Part],
+            FixedVectorType::get(VL.front()->getType(), EltsPerVector),
+            SubMask);
         continue;
-
-      // Check all extracts for a vector register on the target directly
-      // extract values in order.
-      unsigned CurrentIdx = *getExtractIndex(cast<ExtractElementInst>(V));
-      if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != PoisonMaskElem) {
-        unsigned PrevIdx = *getExtractIndex(cast<ExtractElementInst>(VL[Idx - 1]));
-        AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
-                          CurrentIdx % EltsPerVector == Idx % EltsPerVector;
-        RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;
       }
+      bool AllConsecutive = true;
+      SmallVector<int> RegMask(EltsPerVector, PoisonMaskElem);
+      ArrayRef<Value *> SubVL = VL.slice(Part * EltsPerVector, EltsPerVector);
+      for (auto [I, V] : enumerate(SubVL)) {
+        // Need to exclude undefs from analysis.
+        if (isa<UndefValue>(V) || SubMask[I] == PoisonMaskElem)
+          continue;
-      if (AllConsecutive)
-        continue;
-
-      // Skip all indices, except for the last index per vector block.
-      if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
-        continue;
+        // Check all extracts for a vector register on the target directly
+        // extract values in order.
+        unsigned CurrentIdx = *getExtractIndex(cast<ExtractElementInst>(V));
+        if (I > 0 && !isa<UndefValue>(SubVL[I - 1]) &&
+            SubMask[I - 1] != PoisonMaskElem) {
+          unsigned PrevIdx =
+              *getExtractIndex(cast<ExtractElementInst>(SubVL[I - 1]));
+          AllConsecutive &=
+              PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == I;
+          RegMask[I] = CurrentIdx % EltsPerVector;
+        }
+      }
       // If we have a series of extracts which are not consecutive and hence
       // cannot re-use the source vector register directly, compute the shuffle
       // cost to extract the vector with EltsPerVector elements.
-      Cost += TTI.getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc,
-          FixedVectorType::get(VecTy->getElementType(), EltsPerVector),
-          RegMask);
+      if (!AllConsecutive)
+        Cost += TTI.getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc,
+            FixedVectorType::get(VL.front()->getType(), EltsPerVector),
+            RegMask);
     }
     return Cost;
   }
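The per-part cost logic above only charges a shuffle when a part's extracts do not already sit in order inside their source register. A simplified sketch of that consecutiveness test (it folds the `PrevIdx + 1 == CurrentIdx` and lane-alignment checks of the real code into a single modulo check and assumes all live extracts in the slice come from one register):

```cpp
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

// Return true if the live extract indices in this slice read the source
// register in order at matching lane positions, so the register can be
// reused directly with no shuffle cost for this part.
bool sliceIsConsecutive(const std::vector<int> &ExtractIdxs,
                        unsigned EltsPerVector) {
  for (unsigned I = 0; I < ExtractIdxs.size(); ++I) {
    int Idx = ExtractIdxs[I];
    if (Idx == PoisonMaskElem)
      continue;
    // Lane I of the part must read lane I of the source register.
    if (static_cast<unsigned>(Idx) % EltsPerVector != I)
      return false;
  }
  return true;
}

int main() {
  // {4, 5, 6, 7} maps onto lanes 0..3 of the second 4-wide register: free.
  std::printf("%d\n", sliceIsConsecutive({4, 5, 6, 7}, 4)); // prints 1
  // {5, 4, 7, 6} needs an in-register permute: pay one shuffle for the part.
  std::printf("%d\n", sliceIsConsecutive({5, 4, 7, 6}, 4)); // prints 0
  return 0;
}
```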
@@ -7019,90 +7030,61 @@
                        SmallPtrSetImpl<Value *> &CheckedExtracts)
       : TTI(TTI), VectorizedVals(VectorizedVals), R(R),
         CheckedExtracts(CheckedExtracts) {}
-  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask,
-                        TTI::ShuffleKind ShuffleKind) {
+  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
+                        unsigned NumParts) {
     if (Mask.empty())
       return nullptr;
     Value *VecBase = nullptr;
     ArrayRef<Value *> VL = E->Scalars;
-    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
     // If the resulting type is scalarized, do not adjust the cost.
-    unsigned VecNumParts = TTI.getNumberOfParts(VecTy);
-    if (VecNumParts == VecTy->getNumElements())
+    if (NumParts == VL.size())
       return nullptr;
-    DenseMap<Value *, unsigned> ExtractVectorsTys;
-    for (auto [I, V] : enumerate(VL)) {
-      // Ignore non-extractelement scalars.
-      if (isa<UndefValue>(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem))
-        continue;
-      // If all users of instruction are going to be vectorized and this
-      // instruction itself is not going to be vectorized, consider this
-      // instruction as dead and remove its cost from the final cost of the
-      // vectorized tree.
-      // Also, avoid adjusting the cost for extractelements with multiple uses
-      // in different graph entries.
-      const TreeEntry *VE = R.getTreeEntry(V);
-      if (!CheckedExtracts.insert(V).second ||
-          !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
-          (VE && VE != E))
-        continue;
-      auto *EE = cast<ExtractElementInst>(V);
-      VecBase = EE->getVectorOperand();
-      std::optional<unsigned> EEIdx = getExtractIndex(EE);
-      if (!EEIdx)
-        continue;
-      unsigned Idx = *EEIdx;
-      if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) {
-        auto It =
-            ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
-        It->getSecond() = std::min(It->second, Idx);
-      }
-      // Take credit for instruction that will become dead.
-      if (EE->hasOneUse()) {
-        Instruction *Ext = EE->user_back();
-        if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
-              return isa<GetElementPtrInst>(U);
-            })) {
-          // Use getExtractWithExtendCost() to calculate the cost of
-          // extractelement/ext pair.
-          Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
-                                               EE->getVectorOperandType(), Idx);
-          // Add back the cost of s|zext which is subtracted separately.
-          Cost += TTI.getCastInstrCost(
-              Ext->getOpcode(), Ext->getType(), EE->getType(),
-              TTI::getCastContextHint(Ext), CostKind, Ext);
+    unsigned SliceSize = VL.size() / NumParts;
+    for (unsigned Part = 0; Part < NumParts; ++Part) {
+      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+        // Ignore non-extractelement scalars.
+        if (isa<UndefValue>(V) ||
+            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
-        }
-      }
-      Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
-                                     Idx);
-    }
-    // Add a cost for subvector extracts/inserts if required.
-    for (const auto &Data : ExtractVectorsTys) {
-      auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
-      unsigned NumElts = VecTy->getNumElements();
-      if (Data.second % NumElts == 0)
-        continue;
-      if (TTI.getNumberOfParts(EEVTy) > VecNumParts) {
-        unsigned Idx = (Data.second / NumElts) * NumElts;
-        unsigned EENumElts = EEVTy->getNumElements();
-        if (Idx % NumElts == 0)
+        // If all users of instruction are going to be vectorized and this
+        // instruction itself is not going to be vectorized, consider this
+        // instruction as dead and remove its cost from the final cost of the
+        // vectorized tree.
+        // Also, avoid adjusting the cost for extractelements with multiple uses
+        // in different graph entries.
+        const TreeEntry *VE = R.getTreeEntry(V);
+        if (!CheckedExtracts.insert(V).second ||
+            !R.areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+            (VE && VE != E))
          continue;
-        if (Idx + NumElts <= EENumElts) {
-          Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                     EEVTy, std::nullopt, CostKind, Idx, VecTy);
-        } else {
-          // Need to round up the subvector type vectorization factor to avoid a
-          // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
-          // <= EENumElts.
-          auto *SubVT =
-              FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
-          Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                     EEVTy, std::nullopt, CostKind, Idx, SubVT);
+        auto *EE = cast<ExtractElementInst>(V);
+        VecBase = EE->getVectorOperand();
+        std::optional<unsigned> EEIdx = getExtractIndex(EE);
+        if (!EEIdx)
+          continue;
+        unsigned Idx = *EEIdx;
+        // Take credit for instruction that will become dead.
+        if (EE->hasOneUse()) {
+          Instruction *Ext = EE->user_back();
+          if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+                return isa<GetElementPtrInst>(U);
+              })) {
+            // Use getExtractWithExtendCost() to calculate the cost of
+            // extractelement/ext pair.
+            Cost -=
+                TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+                                             EE->getVectorOperandType(), Idx);
+            // Add back the cost of s|zext which is subtracted separately.
+            Cost += TTI.getCastInstrCost(
+                Ext->getOpcode(), Ext->getType(), EE->getType(),
+                TTI::getCastContextHint(Ext), CostKind, Ext);
+            continue;
+          }
         }
-      } else {
-        Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
-                                   VecTy, std::nullopt, CostKind, 0, EEVTy);
+        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
+                                       CostKind, Idx);
       }
     }
     // Check that gather of extractelements can be represented as just a
@@ -7110,7 +7092,7 @@
     // Found the bunch of extractelement instructions that must be gathered
     // into a vector and can be represented as a permutation elements in a
     // single input vector or of 2 input vectors.
-    Cost += computeExtractCost(VL, Mask, ShuffleKind);
+    Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
     return VecBase;
   }
   void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
@@ -7189,7 +7171,7 @@
     assert((IsFinalized || CommonMask.empty()) &&
            "Shuffle construction must be finalized.");
   }
-};
+  };
 
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals,
@@ -7234,30 +7216,37 @@
   reorderScalars(GatheredScalars, ReorderMask);
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask;
-  std::optional<TTI::ShuffleKind> ExtractShuffle;
   std::optional<TTI::ShuffleKind> GatherShuffle;
   SmallVector<const TreeEntry *> Entries;
   Type *ScalarTy = GatheredScalars.front()->getType();
   // Check for gathered extracts.
-  ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
+  unsigned NumParts = TTI->getNumberOfParts(FixedVectorType::get(
+      GatheredScalars.front()->getType(), GatheredScalars.size()));
+  if (NumParts == 0)
+    NumParts = 1;
+  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
+      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
   SmallVector<Value *> IgnoredVals;
   if (UserIgnoreList)
     IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
   bool Resized = false;
-  if (Value *VecBase = Estimator.adjustExtracts(
-          E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
-    if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
-      if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
-        Resized = true;
-        GatheredScalars.append(VF - GatheredScalars.size(),
-                               PoisonValue::get(ScalarTy));
-      }
+  if (!ExtractShuffles.empty()) {
+    if (Value *VecBase = Estimator.adjustExtracts(E, ExtractMask,
+                                                  ExtractShuffles, NumParts))
+      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+        if (VF == VecBaseTy->getNumElements() &&
+            GatheredScalars.size() != VF) {
+          Resized = true;
+          GatheredScalars.append(VF - GatheredScalars.size(),
+                                 PoisonValue::get(ScalarTy));
+        }
+  }
   // Do not try to look for reshuffled loads for gathered loads (they will be
   // handled later), for vectorized scalars, and cases, which are definitely
   // not profitable (splats and small gather nodes.)
-  if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+  if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
       E->isAltShuffle() ||
       all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
       isSplat(E->Scalars) ||
@@ -7318,7 +7307,7 @@
     std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
     Estimator.add(BV, ReuseMask);
   }
-  if (ExtractShuffle)
+  if (!ExtractShuffles.empty())
     Estimator.add(E, std::nullopt);
   return Estimator.finalize(E->ReuseShuffleIndices);
 }
@@ -9364,7 +9353,10 @@
       : Builder(Builder), R(R) {}
 
   /// Adjusts extractelements after reusing them.
-  Value *adjustExtracts(const TreeEntry *E, ArrayRef<int> Mask) {
+  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
+                        unsigned NumParts, bool &UseVecBaseAsInput) {
+    UseVecBaseAsInput = false;
+    SmallPtrSet<Value *, 4> UniqueBases;
     Value *VecBase = nullptr;
     for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
       int Idx = Mask[I];
@@ -9372,6 +9364,7 @@
         continue;
       auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
       VecBase = EI->getVectorOperand();
+      UniqueBases.insert(VecBase);
       // If the only one use is vectorized - can delete the extractelement
       // itself.
       if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) {
@@ -9380,7 +9373,71 @@
         continue;
       R.eraseInstruction(EI);
     }
-    return VecBase;
+    if (NumParts == 1 || UniqueBases.size() == 1)
+      return VecBase;
+    UseVecBaseAsInput = true;
+    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
+      for (auto [I, Idx] : enumerate(Mask))
+        if (Idx != PoisonMaskElem)
+          Idx = I;
+    };
+    Value *Vec = nullptr;
+    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+    unsigned SliceSize = E->Scalars.size() / NumParts;
+    for (unsigned Part = 0; Part < NumParts; ++Part) {
+      ArrayRef<Value *> VL =
+          ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
+      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+      constexpr int MaxBases = 2;
+      SmallVector<Value *> Bases(MaxBases);
+#ifndef NDEBUG
+      int PrevSize = 0;
+#endif // NDEBUG
+      for (const auto [I, V] : enumerate(VL)) {
+        if (SubMask[I] == PoisonMaskElem)
+          continue;
+        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
+        const int Size =
+            cast<FixedVectorType>(VecOp->getType())->getNumElements();
+#ifndef NDEBUG
+        assert((PrevSize == Size || PrevSize == 0) &&
+               "Expected vectors of the same size.");
+        PrevSize = Size;
+#endif // NDEBUG
+        Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
+      }
+      if (!Bases.front())
+        continue;
+      Value *SubVec;
+      if (Bases.back()) {
+        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
+        TransformToIdentity(SubMask);
+      } else {
+        SubVec = Bases.front();
+      }
+      if (!Vec) {
+        Vec = SubVec;
+        copy(SubMask, VecMask.begin());
+      } else {
+        unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
+        if (Vec->getType() != SubVec->getType()) {
+          unsigned SubVecVF =
+              cast<FixedVectorType>(SubVec->getType())->getNumElements();
+          if (VF < SubVecVF)
+            TransformToIdentity(VecMask);
+          VF = std::max(VF, SubVecVF);
+        }
+        // Adjust SubMask.
+        for (auto [I, Idx] : enumerate(SubMask))
+          if (Idx != PoisonMaskElem)
+            Idx += VF;
+        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
+        Vec = createShuffle(Vec, SubVec, VecMask);
+        TransformToIdentity(VecMask);
+      }
+    }
+    copy(VecMask, Mask.begin());
+    return Vec;
   }
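A detail that is easy to miss in the codegen `adjustExtracts` above: each time a part (or the running accumulator) is materialized by a `createShuffle` call, its recorded sub-mask is rewritten to the identity, so that subsequent combining shuffles index the freshly built value rather than the original bases. A tiny sketch of that `TransformToIdentity` step:

```cpp
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

// After a slice has been materialized by its own shuffle, its elements sit
// at their slice-local positions, so the remembered mask entries collapse
// to the identity (poison lanes stay poison).
void transformToIdentity(std::vector<int> &Mask) {
  for (size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] != PoisonMaskElem)
      Mask[I] = static_cast<int>(I);
}

int main() {
  std::vector<int> SubMask = {3, 1, PoisonMaskElem, 0};
  transformToIdentity(SubMask);
  for (int Idx : SubMask)
    std::printf("%d ", Idx); // prints: 0 1 -1 3
  std::printf("\n");
  return 0;
}
```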
   /// Checks if the specified entry \p E needs to be delayed because of its
   /// dependency nodes.
@@ -9705,26 +9762,39 @@
   ResTy Res = ResTy();
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask;
-  std::optional<TTI::ShuffleKind> ExtractShuffle;
+  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
+  Value *ExtractVecBase = nullptr;
+  bool UseVecBaseAsInput = false;
   std::optional<TTI::ShuffleKind> GatherShuffle;
   SmallVector<const TreeEntry *> Entries;
   Type *ScalarTy = GatheredScalars.front()->getType();
   if (!all_of(GatheredScalars, UndefValue::classof)) {
     // Check for gathered extracts.
-    ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask);
     SmallVector<Value *> IgnoredVals;
     if (UserIgnoreList)
       IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
     bool Resized = false;
-    if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask))
-      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
-        if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
-          Resized = true;
-          GatheredScalars.append(VF - GatheredScalars.size(),
-                                 PoisonValue::get(ScalarTy));
-        }
+    unsigned NumParts = TTI->getNumberOfParts(FixedVectorType::get(
+        GatheredScalars.front()->getType(), GatheredScalars.size()));
+    if (NumParts == 0)
+      NumParts = 1;
+    ExtractShuffles =
+        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
+    if (!ExtractShuffles.empty()) {
+      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
+              E, ExtractMask, NumParts, UseVecBaseAsInput)) {
+        ExtractVecBase = VecBase;
+        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
+          if (VF == VecBaseTy->getNumElements() &&
+              GatheredScalars.size() != VF) {
+            Resized = true;
+            GatheredScalars.append(VF - GatheredScalars.size(),
+                                   PoisonValue::get(ScalarTy));
+          }
+      }
+    }
     // Gather extracts after we check for full matched gathers only.
-    if (ExtractShuffle || E->getOpcode() != Instruction::Load ||
+    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
         E->isAltShuffle() ||
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
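The `UseVecBaseAsInput` flag reported back by the codegen `adjustExtracts` mirrors its `NumParts == 1 || UniqueBases.size() == 1` early-out: only when the gather spans several registers and several distinct source vectors does the builder pre-combine the bases into one wide input. A hedged sketch of that decision (stand-in types, not the LLVM API):

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Stand-in for a source register feeding the extractelements.
struct Vec {};

// Pre-combining into a single wide base only pays off when more than one
// distinct base feeds the gather and the node spans several registers.
bool useVecBaseAsInput(const std::vector<const Vec *> &BasePerScalar,
                       unsigned NumParts) {
  std::set<const Vec *> UniqueBases(BasePerScalar.begin(),
                                    BasePerScalar.end());
  return NumParts > 1 && UniqueBases.size() > 1;
}

int main() {
  Vec A, B;
  std::printf("%d\n", useVecBaseAsInput({&A, &A, &A, &A}, 2)); // prints 0
  std::printf("%d\n", useVecBaseAsInput({&A, &A, &B, &B}, 2)); // prints 1
  return 0;
}
```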
@@ -9865,30 +9935,35 @@
         }
       }
     };
-  if (ExtractShuffle || GatherShuffle) {
+  if (!ExtractShuffles.empty() || GatherShuffle) {
     bool IsNonPoisoned = true;
     bool IsUsedInExpr = false;
     Value *Vec1 = nullptr;
-    if (ExtractShuffle) {
+    if (!ExtractShuffles.empty()) {
       // Gather of extractelements can be represented as just a shuffle of
       // a single/two vectors the scalars are extracted from.
       // Find input vectors.
       Value *Vec2 = nullptr;
       for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
-        if (ExtractMask[I] == PoisonMaskElem ||
-            (!Mask.empty() && Mask[I] != PoisonMaskElem)) {
+        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
           ExtractMask[I] = PoisonMaskElem;
-          continue;
-        }
-        if (isa<UndefValue>(E->Scalars[I]))
-          continue;
-        auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
-        if (!Vec1) {
-          Vec1 = EI->getVectorOperand();
-        } else if (Vec1 != EI->getVectorOperand()) {
-          assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
-                 "Expected only 1 or 2 vectors shuffle.");
-          Vec2 = EI->getVectorOperand();
+      }
+      if (UseVecBaseAsInput) {
+        Vec1 = ExtractVecBase;
+      } else {
+        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
+          if (ExtractMask[I] == PoisonMaskElem)
+            continue;
+          if (isa<UndefValue>(E->Scalars[I]))
+            continue;
+          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
+          if (!Vec1) {
+            Vec1 = EI->getVectorOperand();
+          } else if (Vec1 != EI->getVectorOperand()) {
+            assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
+                   "Expected only 1 or 2 vectors shuffle.");
+            Vec2 = EI->getVectorOperand();
+          }
         }
       }
       if (Vec2) {
@@ -9927,10 +10002,14 @@
   int MSz = Mask.size();
   // Try to build constant vector and shuffle with it only if currently we
   // have a single permutation and more than 1 scalar constants.
-  bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+  bool IsSingleShuffle = ExtractShuffles.empty() || !GatherShuffle;
   bool IsIdentityShuffle =
-      (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
-           TTI::SK_PermuteSingleSrc &&
+      ((UseVecBaseAsInput ||
+        all_of(ExtractShuffles,
+               [](const std::optional<TTI::ShuffleKind> &SK) {
+                 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+                        TTI::SK_PermuteSingleSrc;
+               })) &&
        none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
        ShuffleVectorInst::isIdentityMask(ExtractMask)) ||
       (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
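The test updates below all follow from the same driver change: `NumParts` comes from `TTI::getNumberOfParts` for the gathered type (clamped to at least 1), so on SSE-class targets a 256-bit gather is now matched as two 128-bit halves instead of one wide permute, while AVX keeps the wide form. A back-of-the-envelope sketch of that arithmetic (hypothetical register width, not an actual TTI query):

```cpp
#include <cstdio>

// Mirror of the driver logic above: ask how many registers the gathered
// vector occupies and slice the gather accordingly.
int main() {
  const unsigned NumElts = 8, EltBits = 32, RegBits = 128; // e.g. SSE
  unsigned NumParts = (NumElts * EltBits) / RegBits; // getNumberOfParts stand-in
  if (NumParts == 0)
    NumParts = 1; // Scalarized or sub-register types: treat as one part.
  unsigned SliceSize = NumElts / NumParts;
  std::printf("%u parts of %u elements\n", NumParts, SliceSize); // 2 parts of 4
  return 0;
}
```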
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -3,27 +3,21 @@
 define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; CHECK-NEXT:    br label [[TMP17:%.*]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP22]] = or i32 [[TMP18]], 0
-; CHECK-NEXT:    br label [[TMP17]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2:%.*]], <2 x i64> [[TMP0]], <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> , <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> , <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32>
+; CHECK-NEXT:    br label [[TMP11]]
 ;
   %4 = extractelement <2 x i64> %1, i64 0
   %5 = or i64 %4, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -168,23 +168,23 @@
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32>
@@ -241,20 +241,24 @@
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32>
-; SLM-NEXT:    ret <8 x float> [[R071]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
@@ -298,11 +302,31 @@
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SSE-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SLM-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
+; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -324,11 +348,31 @@
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SSE-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SLM-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
+; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -368,19 +412,23 @@
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32>
-; SSE-NEXT:    ret <16 x i16> [[RV151]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <16 x i16> [[TMP3]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -168,23 +168,23 @@
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32>
@@ -241,20 +241,24 @@
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32>
-; SLM-NEXT:    ret <8 x float> [[R071]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
@@ -298,11 +302,31 @@
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SSE-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SLM-NEXT:    ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
+; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %a0 = extractelement <4 x i64> %a, i32 0
   %a1 = extractelement <4 x i64> %a, i32 1
@@ -324,11 +348,31 @@
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SSE-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SLM-NEXT:    ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
+; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -368,19 +412,23 @@
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32>
-; SSE-NEXT:    ret <16 x i16> [[RV151]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    ret <16 x i16> [[TMP3]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -147,23 +147,23 @@
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SSE-NEXT:    ret <4 x double> [[R031]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SLM-NEXT:    ret <4 x double> [[R031]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32>
@@ -192,20 +192,24 @@
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <8 x float> [[TMP3]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
@@ -192,20 +192,24 @@
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT: ret <8 x float> [[TMP3]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SSE-NEXT: ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32>
-; SLM-NEXT: ret <8 x float> [[R071]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SLM-NEXT: ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
@@ -249,11 +253,31 @@
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SSE-NEXT: ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SLM-NEXT: ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x i64> [[TMP3]]
 ;
 %a0 = extractelement <4 x i64> %a, i32 0
 %a1 = extractelement <4 x i64> %a, i32 1
@@ -275,11 +299,31 @@
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
 ;
 %a0 = extractelement <8 x i32> %a, i32 0
 %a1 = extractelement <8 x i32> %a, i32 1
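The shape change encoded by these regenerated SSE and SLM checks: rather than two full-width shuffles feeding a single wide sub, the output now computes the low and high halves independently and concatenates them. A sketch of that split form for the v8i32 case; the mask constants are illustrative assumptions about the even/odd lane split, not values taken from the checks:

define <8 x i32> @hsub_v8i32_split_sketch(<8 x i32> %a, <8 x i32> %b) {
  ; Gather left and right operands of the low-half subtracts.
  %lo.l = shufflevector <8 x i32> %a, <8 x i32> %b, <4 x i32> <i32 0, i32 2, i32 8, i32 10>
  %lo.r = shufflevector <8 x i32> %a, <8 x i32> %b, <4 x i32> <i32 1, i32 3, i32 9, i32 11>
  ; Same for the high half.
  %hi.l = shufflevector <8 x i32> %a, <8 x i32> %b, <4 x i32> <i32 4, i32 6, i32 12, i32 14>
  %hi.r = shufflevector <8 x i32> %a, <8 x i32> %b, <4 x i32> <i32 5, i32 7, i32 13, i32 15>
  ; Two half-width subtracts instead of one full-width one.
  %lo = sub <4 x i32> %lo.l, %lo.r
  %hi = sub <4 x i32> %hi.l, %hi.r
  ; Concatenate the halves back into the 8-wide result.
  %r = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %r
}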
@@ -319,19 +363,23 @@
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT: [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32>
-; SSE-NEXT: ret <16 x i16> [[RV151]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SSE-NEXT: ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT: ret <16 x i16> [[TMP3]]
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SLM-NEXT: ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
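The hsub.ll hunks that follow mirror hsub-inseltpoison.ll hunk for hunk. By the usual LLVM test convention the two files should differ only in the base vector that the scalar results are inserted into, roughly:

  %v0 = insertelement <4 x double> undef, double %r0, i32 0    ; hsub.ll
  %v0 = insertelement <4 x double> poison, double %r0, i32 0   ; hsub-inseltpoison.ll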
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -147,23 +147,23 @@
 define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SSE-NEXT: ret <4 x double> [[R031]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SSE-NEXT: ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
-; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R031:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32>
-; SLM-NEXT: ret <4 x double> [[R031]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
+; SLM-NEXT: ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32>
@@ -192,20 +192,24 @@
 
 define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; SSE-NEXT: ret <8 x float> [[TMP3]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SSE-NEXT: ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
-; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R071:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32>
-; SLM-NEXT: ret <8 x float> [[R071]]
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32>
+; SLM-NEXT: ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32>
@@ -249,11 +253,31 @@
 }
 
 define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: @test_v4i64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SSE-NEXT: ret <4 x i64> [[TMP7]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32>
+; SLM-NEXT: ret <4 x i64> [[TMP7]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x i64> [[TMP3]]
 ;
 %a0 = extractelement <4 x i64> %a, i32 0
 %a1 = extractelement <4 x i64> %a, i32 1
@@ -275,11 +299,31 @@
 }
 
 define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @test_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <8 x i32> [[TMP3]]
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32>
+; SLM-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32>
+; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
 ;
 %a0 = extractelement <8 x i32> %a, i32 0
 %a1 = extractelement <8 x i32> %a, i32 1
@@ -319,19 +363,23 @@
 define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
-; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT: [[RV151:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32>
-; SSE-NEXT: ret <16 x i16> [[RV151]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SSE-NEXT: ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT: ret <16 x i16> [[TMP3]]
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32>
+; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
+; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32>
+; SLM-NEXT: ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -28,15 +28,13 @@
 ;
 ; SSE42-LABEL: @reduce_and4(
 ; SSE42-NEXT: entry:
-; SSE42-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]])
-; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]])
-; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0]], [[TMP1]]
-; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]])
-; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP2]]
-; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]])
-; SSE42-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP3]]
-; SSE42-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]]
-; SSE42-NEXT: ret i32 [[OP_RDX3]]
+; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32>
+; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32>
+; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
+; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
+; SSE42-NEXT: ret i32 [[OP_RDX1]]
 ;
 ; AVX-LABEL: @reduce_and4(
 ; AVX-NEXT: entry:
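The new SSE42 checks for reduce_and4 above describe two v8i32 and-reductions over concatenated operands in place of four v4i32 reductions chained with scalar and. A self-contained sketch of that reduced form, assuming identity concatenation masks (the function name is hypothetical):

declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)

define i32 @reduce_and4_sketch(i32 %acc, <4 x i32> %v1, <4 x i32> %v2,
                               <4 x i32> %v3, <4 x i32> %v4) {
  ; Concatenate the four 4-wide inputs into two 8-wide vectors.
  %t0 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t1 = shufflevector <4 x i32> %v4, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Two wide reductions replace four narrow ones.
  %r1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %t1)
  %r0 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %t0)
  ; Fold the two partial results and the incoming accumulator.
  %rdx = and i32 %r1, %r0
  %res = and i32 %rdx, %acc
  ret i32 %res
}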
@@ -103,15 +101,13 @@
 ; SSE2-NEXT: ret i32 [[OP_RDX1]]
 ;
 ; SSE42-LABEL: @reduce_and4_transpose(
-; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V4:%.*]])
-; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V3:%.*]])
-; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP2]]
-; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V2:%.*]])
-; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]]
-; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[V1:%.*]])
-; SSE42-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP4]]
-; SSE42-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[ACC:%.*]]
-; SSE42-NEXT: ret i32 [[OP_RDX3]]
+; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32>
+; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32>
+; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
+; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
+; SSE42-NEXT: ret i32 [[OP_RDX1]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
 ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32>