diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -546,7 +546,6 @@ return std::nullopt; return *EI->idx_begin(); } - /// Tries to find extractelement instructions with constant indices from fixed /// vector type and gather such instructions into a bunch, which highly likely /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was @@ -658,6 +657,36 @@ return Res; } +/// Tries to find extractelement instructions with constant indices from fixed +/// vector type and gather such instructions into a bunch, which highly likely +/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was +/// successful, the matched scalars are replaced by poison values in \p VL for +/// future analysis. +static SmallVector> +tryToGatherExtractElements(SmallVectorImpl &VL, + SmallVectorImpl &Mask, unsigned NumParts) { + assert(NumParts > 0 && "NumParts expected be greater than or equal to 1."); + SmallVector> ShufflesRes(NumParts); + Mask.assign(VL.size(), PoisonMaskElem); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + // Scan list of gathered scalars for extractelements that can be represented + // as shuffles. + MutableArrayRef SubVL = + MutableArrayRef(VL).slice(Part * SliceSize, SliceSize); + SmallVector SubMask; + std::optional Res = + tryToGatherSingleRegisterExtractElements(SubVL, SubMask); + ShufflesRes[Part] = Res; + copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + } + if (none_of(ShufflesRes, [](const std::optional &Res) { + return Res.has_value(); + })) + ShufflesRes.clear(); + return ShufflesRes; +} + namespace { /// Main data required for vectorization of instructions. @@ -7116,101 +7145,80 @@ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers))); }; - /// Compute the cost of creating a vector of type \p VecTy containing the - /// extracted values from \p VL. - InstructionCost computeExtractCost(ArrayRef VL, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { - unsigned NumElts = 0; - for (Value *V : VL) { - auto *EE = dyn_cast(V); - if (!EE) - continue; - auto *VecTy = cast(EE->getVectorOperandType()); - NumElts = std::max(NumElts, VecTy->getNumElements()); - } - assert(NumElts > 0 && - "Expected at least 1-element fixed length vector(s)."); - auto *VecTy = FixedVectorType::get(VL.front()->getType(), NumElts); - unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - if (!NumOfParts || NumElts < NumOfParts) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - unsigned EltsPerVector = PowerOf2Ceil(divideCeil(NumElts, NumOfParts)); - int ValNum = -1; - int ValIdx = -1; - // Check that if trying to permute 2 input vectors (which may result in - // several vector registers), each per-register subvector is the result of - // the permutation of 2 single registers. - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc && - !all_of(enumerate(Mask), [&](auto &&Arg) { - if (Arg.value() == PoisonMaskElem) - return true; - int CurValNum = (Arg.value() % NumElts) / EltsPerVector; - int CurValIdx = Arg.index() / EltsPerVector; - if (ValIdx != CurValIdx) { - ValIdx = CurValIdx; - ValNum = CurValNum; - return true; - } - return CurValNum == ValNum; - })) - return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); - + /// Compute the cost of creating a vector containing the extracted values from + /// \p VL. + InstructionCost + computeExtractCost(ArrayRef VL, ArrayRef Mask, + ArrayRef> ShuffleKinds, + unsigned NumParts) { + assert(VL.size() > NumParts && "Unexpected scalarized shuffle."); + unsigned NumElts = + std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) { + auto *EE = dyn_cast(V); + if (!EE) + return Sz; + auto *VecTy = cast(EE->getVectorOperandType()); + return std::max(Sz, VecTy->getNumElements()); + }); + unsigned NumSrcRegs = TTI.getNumberOfParts( + FixedVectorType::get(VL.front()->getType(), NumElts)); + if (NumSrcRegs == 0) + NumSrcRegs = 1; + // FIXME: this must be moved to TTI for better estimation. + unsigned EltsPerVector = PowerOf2Ceil(std::max( + divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs))); + auto CheckPerRegistersShuffle = + [&](MutableArrayRef Mask) -> std::optional { + DenseSet RegIndices; + // Check that if trying to permute same single/2 input vectors. + TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; + int FirstRegId = -1; + for (auto [Idx, I] : enumerate(Mask)) { + if (I == PoisonMaskElem) + continue; + int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector; + if (FirstRegId < 0) + FirstRegId = RegId; + RegIndices.insert(RegId); + if (RegIndices.size() > 2) + return std::nullopt; + if (RegIndices.size() == 2) + ShuffleKind = TTI::SK_PermuteTwoSrc; + I = (I % NumElts) % EltsPerVector + + (RegId == FirstRegId ? 0 : EltsPerVector); + } + return ShuffleKind; + }; InstructionCost Cost = 0; // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a // shuffle to extract the values into a vector register. - auto *RegisterVecTy = - FixedVectorType::get(VL.front()->getType(), EltsPerVector); - SmallVector RegMask(EltsPerVector, PoisonMaskElem); - TTI::ShuffleKind RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc; - Value *VecBase = nullptr; - bool IsIdentity = true; - for (auto [Idx, V] : enumerate(VL)) { - // Reached the start of a new vector registers. - if (Idx % EltsPerVector == 0) { - RegMask.assign(EltsPerVector, PoisonMaskElem); - RegisterSK = TargetTransformInfo::SK_PermuteSingleSrc; - VecBase = nullptr; - } - - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == PoisonMaskElem) - continue; - - // Check all extracts for a vector register on the target directly - // extract values in order. - unsigned CurrentIdx = *getExtractIndex(cast(V)); - unsigned PrevIdx = CurrentIdx; - if (Idx % EltsPerVector != 0 && !isa(VL[Idx - 1]) && - Mask[Idx - 1] != PoisonMaskElem) - PrevIdx = *getExtractIndex(cast(VL[Idx - 1])) + 1; - if (!VecBase) { - VecBase = cast(V)->getVectorOperand(); - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - IsIdentity = CurrentIdx % EltsPerVector == Idx % EltsPerVector; - } else if (VecBase != cast(V)->getVectorOperand()) { - IsIdentity = false; - RegisterSK = TargetTransformInfo::SK_PermuteTwoSrc; - RegMask[Idx % EltsPerVector] = - CurrentIdx % EltsPerVector + EltsPerVector; - } else { - IsIdentity &= PrevIdx == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; - RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; - } - - if (IsIdentity) + for (unsigned Part = 0; Part < NumParts; ++Part) { + if (!ShuffleKinds[Part]) continue; - - // Skip all indices, except for the last index per vector block. - if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) + ArrayRef MaskSlice = + Mask.slice(Part * EltsPerVector, + (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0) + ? Mask.size() % EltsPerVector + : EltsPerVector); + SmallVector SubMask(EltsPerVector, PoisonMaskElem); + copy(MaskSlice, SubMask.begin()); + std::optional RegShuffleKind = + CheckPerRegistersShuffle(SubMask); + if (!RegShuffleKind) { + Cost += TTI.getShuffleCost( + *ShuffleKinds[Part], + FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice); continue; - - // If we have a series of extracts which are not consecutive and hence - // cannot re-use the source vector register directly, compute the shuffle - // cost to extract the vector with EltsPerVector elements. - Cost += TTI.getShuffleCost(RegisterSK, RegisterVecTy, RegMask); + } + if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || + !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) { + Cost += TTI.getShuffleCost( + *RegShuffleKind, + FixedVectorType::get(VL.front()->getType(), EltsPerVector), + SubMask); + } } return Cost; } @@ -7385,98 +7393,87 @@ SmallPtrSetImpl &CheckedExtracts) : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), CheckedExtracts(CheckedExtracts) {} - Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + ArrayRef> ShuffleKinds, + unsigned NumParts) { if (Mask.empty()) return nullptr; Value *VecBase = nullptr; ArrayRef VL = E->Scalars; - auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. - unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) + if (NumParts == VL.size()) return nullptr; - DenseMap ExtractVectorsTys; - for (auto [I, V] : enumerate(VL)) { - // Ignore non-extractelement scalars. - if (isa(V) || (!Mask.empty() && Mask[I] == PoisonMaskElem)) - continue; - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - // Also, avoid adjusting the cost for extractelements with multiple uses - // in different graph entries. - const TreeEntry *VE = R.getTreeEntry(V); - if (!CheckedExtracts.insert(V).second || - !R.areAllUsersVectorized(cast(V), &VectorizedVals) || - (VE && VE != E)) - continue; - auto *EE = cast(V); - VecBase = EE->getVectorOperand(); - std::optional EEIdx = getExtractIndex(EE); - if (!EEIdx) - continue; - unsigned Idx = *EEIdx; - if (VecNumParts != TTI.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min(It->second, Idx); - } - // Take credit for instruction that will become dead. - if (EE->hasOneUse()) { - Instruction *Ext = EE->user_back(); - if (isa(Ext) && all_of(Ext->users(), [](User *U) { - return isa(U); - })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - Cost -= TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); - // Add back the cost of s|zext which is subtracted separately. - Cost += TTI.getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EE->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); + // Check if it can be considered reused if same extractelements were + // vectorized already. + bool PrevNodeFound = any_of( + ArrayRef(R.VectorizableTree).take_front(E->Idx), + [&](const std::unique_ptr &TE) { + return ((!TE->isAltShuffle() && + TE->getOpcode() == Instruction::ExtractElement) || + TE->State == TreeEntry::NeedToGather) && + all_of(enumerate(TE->Scalars), [&](auto &&Data) { + return VL.size() > Data.index() && + (Mask[Data.index()] == PoisonMaskElem || + isa(VL[Data.index()]) || + Data.value() == VL[Data.index()]); + }); + }); + unsigned SliceSize = VL.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) { + // Ignore non-extractelement scalars. + if (isa(V) || + (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) continue; - } - } - Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind, - Idx); - } - // Add a cost for subvector extracts/inserts if required. - for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTI.getNumberOfParts(EEVTy) > VecNumParts) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx % NumElts == 0) + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. + const TreeEntry *VE = R.getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !R.areAllUsersVectorized(cast(V), &VectorizedVals) || + (VE && VE != E)) continue; - if (Idx + NumElts <= EENumElts) { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, std::nullopt, CostKind, Idx, SubVT); + auto *EE = cast(V); + VecBase = EE->getVectorOperand(); + std::optional EEIdx = getExtractIndex(EE); + if (!EEIdx) + continue; + unsigned Idx = *EEIdx; + // Take credit for instruction that will become dead. + if (EE->hasOneUse() || !PrevNodeFound) { + Instruction *Ext = EE->user_back(); + if (isa(Ext) && all_of(Ext->users(), [](User *U) { + return isa(U); + })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTI.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } } - } else { - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, std::nullopt, CostKind, 0, EEVTy); + Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(), + CostKind, Idx); } } + // Done for reused if same extractelements were vectorized already. + if (PrevNodeFound) + return VecBase; // Check that gather of extractelements can be represented as just a // shuffle of a single/two vectors the scalars are extracted from. // Found the bunch of extractelement instructions that must be gathered // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. - Cost += computeExtractCost(VL, Mask, ShuffleKind); + Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); return VecBase; } void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { @@ -7555,7 +7552,7 @@ assert((IsFinalized || CommonMask.empty()) && "Shuffle construction must be finalized."); } -}; + }; InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, @@ -7612,29 +7609,31 @@ reorderScalars(GatheredScalars, ReorderMask); SmallVector Mask; SmallVector ExtractMask; - std::optional ExtractShuffle; std::optional GatherShuffle; SmallVector Entries; // Check for gathered extracts. - ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - bool Resized = false; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0) + NumParts = 1; + SmallVector> ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + if (Value *VecBase = Estimator.adjustExtracts(E, ExtractMask, + ExtractShuffles, NumParts)) + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely // not profitable (splats and small gather nodes.) - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -7700,7 +7699,7 @@ std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } - if (ExtractShuffle) + if (!ExtractShuffles.empty()) Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } @@ -9806,7 +9805,10 @@ : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. - Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { + Value *adjustExtracts(const TreeEntry *E, MutableArrayRef Mask, + unsigned NumParts, bool &UseVecBaseAsInput) { + UseVecBaseAsInput = false; + SmallPtrSet UniqueBases; Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -9814,6 +9816,7 @@ continue; auto *EI = cast(E->Scalars[I]); VecBase = EI->getVectorOperand(); + UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { @@ -9822,7 +9825,75 @@ continue; R.eraseInstruction(EI); } - return VecBase; + if (NumParts == 1 || UniqueBases.size() == 1) + return VecBase; + UseVecBaseAsInput = true; + auto TransformToIdentity = [](MutableArrayRef Mask) { + for (auto [I, Idx] : enumerate(Mask)) + if (Idx != PoisonMaskElem) + Idx = I; + }; + // Perform multi-register vector shuffle, joining them into a single virtual + // long vector. + // Need to shuffle each part independently and then insert all this parts + // into a long virtual vector register, forming the original vector. + Value *Vec = nullptr; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + unsigned SliceSize = E->Scalars.size() / NumParts; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef VL = + ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize); + MutableArrayRef SubMask = Mask.slice(Part * SliceSize, SliceSize); + constexpr int MaxBases = 2; + SmallVector Bases(MaxBases); +#ifndef NDEBUG + int PrevSize = 0; +#endif // NDEBUG + for (const auto [I, V]: enumerate(VL)) { + if (SubMask[I] == PoisonMaskElem) + continue; + Value *VecOp = cast(V)->getVectorOperand(); + const int Size = + cast(VecOp->getType())->getNumElements(); +#ifndef NDEBUG + assert((PrevSize == Size || PrevSize == 0) && + "Expected vectors of the same size."); + PrevSize = Size; +#endif // NDEBUG + Bases[SubMask[I] < Size ? 0 : 1] = VecOp; + } + if (!Bases.front()) + continue; + Value *SubVec; + if (Bases.back()) { + SubVec = createShuffle(Bases.front(), Bases.back(), SubMask); + TransformToIdentity(SubMask); + } else { + SubVec = Bases.front(); + } + if (!Vec) { + Vec = SubVec; + copy(SubMask, VecMask.begin()); + } else { + unsigned VF = cast(Vec->getType())->getNumElements(); + if (Vec->getType() != SubVec->getType()) { + unsigned SubVecVF = + cast(SubVec->getType())->getNumElements(); + if (VF < SubVecVF) + TransformToIdentity(VecMask); + VF = std::max(VF, SubVecVF); + } + // Adjust SubMask. + for (auto [I, Idx] : enumerate(SubMask)) + if (Idx != PoisonMaskElem) + Idx += VF; + copy(SubMask, std::next(VecMask.begin(), Part * SliceSize)); + Vec = createShuffle(Vec, SubVec, VecMask); + TransformToIdentity(VecMask); + } + } + copy(VecMask, Mask.begin()); + return Vec; } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. @@ -10159,28 +10230,37 @@ BVTy ShuffleBuilder(Params...); ResTy Res = ResTy(); SmallVector Mask; - SmallVector ExtractMask; - std::optional ExtractShuffle; + SmallVector ExtractMask(GatheredScalars.size(), PoisonMaskElem); + SmallVector> ExtractShuffles; + Value *ExtractVecBase = nullptr; + bool UseVecBaseAsInput; std::optional GatherShuffle; SmallVector Entries; Type *ScalarTy = GatheredScalars.front()->getType(); if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. - ExtractShuffle = - tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(ScalarTy, GatheredScalars.size())); + if (NumParts == 0) + NumParts = 1; + ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + if (!ExtractShuffles.empty()) { + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, NumParts, UseVecBaseAsInput)) { + ExtractVecBase = VecBase; + if (auto *VecBaseTy = dyn_cast(VecBase->getType())) + if (VF == VecBaseTy->getNumElements() && + GatheredScalars.size() != VF) { + Resized = true; + GatheredScalars.append(VF - GatheredScalars.size(), + PoisonValue::get(ScalarTy)); + } + } + } // Gather extracts after we check for full matched gathers only. - if (ExtractShuffle || E->getOpcode() != Instruction::Load || + if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -10328,30 +10408,35 @@ } } }; - if (ExtractShuffle || GatherShuffle) { + if (!ExtractShuffles.empty() || GatherShuffle) { bool IsNonPoisoned = true; bool IsUsedInExpr = false; Value *Vec1 = nullptr; - if (ExtractShuffle) { + if (!ExtractShuffles.empty()) { // Gather of extractelements can be represented as just a shuffle of // a single/two vectors the scalars are extracted from. // Find input vectors. Value *Vec2 = nullptr; for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { - if (ExtractMask[I] == PoisonMaskElem || - (!Mask.empty() && Mask[I] != PoisonMaskElem)) { + if (!Mask.empty() && Mask[I] != PoisonMaskElem) ExtractMask[I] = PoisonMaskElem; - continue; - } - if (isa(E->Scalars[I])) - continue; - auto *EI = cast(E->Scalars[I]); - if (!Vec1) { - Vec1 = EI->getVectorOperand(); - } else if (Vec1 != EI->getVectorOperand()) { - assert((!Vec2 || Vec2 == EI->getVectorOperand()) && - "Expected only 1 or 2 vectors shuffle."); - Vec2 = EI->getVectorOperand(); + } + if (UseVecBaseAsInput) { + Vec1 = ExtractVecBase; + } else { + for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { + if (ExtractMask[I] == PoisonMaskElem) + continue; + if (isa(E->Scalars[I])) + continue; + auto *EI = cast(E->Scalars[I]); + if (!Vec1) { + Vec1 = EI->getVectorOperand(); + } else if (Vec1 != EI->getVectorOperand()) { + assert((!Vec2 || Vec2 == EI->getVectorOperand()) && + "Expected only 1 or 2 vectors shuffle."); + Vec2 = EI->getVectorOperand(); + } } } if (Vec2) { @@ -10390,10 +10475,14 @@ int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = ExtractShuffles.empty() || !GatherShuffle; bool IsIdentityShuffle = - (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + ((UseVecBaseAsInput || + all_of(ExtractShuffles, + [](const std::optional &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + })) && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -75,64 +75,47 @@ ; CHECK-NEXT: [[TMP4TT_0_LCSSA:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[ADD_I]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE45:%.*]] = extractelement <2 x i64> [[TMP4TT_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[VGETQ_LANE]], [[VGETQ_LANE45]] -; CHECK-NEXT: [[CONV48:%.*]] = trunc i64 [[ADD]] to i32 -; CHECK-NEXT: [[VGETQ_LANE51:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE55:%.*]] = extractelement <2 x i64> [[TMP4FF_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD57:%.*]] = add i64 [[VGETQ_LANE51]], [[VGETQ_LANE55]] -; CHECK-NEXT: [[CONV60:%.*]] = trunc i64 [[ADD57]] to i32 -; CHECK-NEXT: [[VGETQ_LANE63:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE67:%.*]] = extractelement <2 x i64> [[TMP4TF_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD69:%.*]] = add i64 [[VGETQ_LANE63]], [[VGETQ_LANE67]] -; CHECK-NEXT: [[CONV72:%.*]] = trunc i64 [[ADD69]] to i32 -; CHECK-NEXT: [[VGETQ_LANE75:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 0 -; CHECK-NEXT: [[VGETQ_LANE79:%.*]] = extractelement <2 x i64> [[TMP4FT_0_LCSSA]], i64 1 -; CHECK-NEXT: [[ADD81:%.*]] = add i64 [[VGETQ_LANE75]], [[VGETQ_LANE79]] -; CHECK-NEXT: [[CONV84:%.*]] = trunc i64 [[ADD81]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> ; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMBEROFBOOLS]], 127 ; CHECK-NEXT: [[CMP86284:%.*]] = icmp ugt i32 [[AND]], 31 ; CHECK-NEXT: br i1 [[CMP86284]], label [[WHILE_BODY88:%.*]], label [[WHILE_END122:%.*]] ; CHECK: while.body88: ; CHECK-NEXT: [[PA_ADDR_1291:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_END121:%.*]] ], [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ] ; CHECK-NEXT: [[PB_ADDR_1290:%.*]] = phi ptr [ [[INCDEC_PTR89:%.*]], [[WHILE_END121]] ], [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CTT_0289:%.*]] = phi i32 [ [[ADD99:%.*]], [[WHILE_END121]] ], [ [[CONV48]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CFF_0288:%.*]] = phi i32 [ [[ADD106:%.*]], [[WHILE_END121]] ], [ [[CONV60]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CTF_0287:%.*]] = phi i32 [ [[ADD113:%.*]], [[WHILE_END121]] ], [ [[CONV72]], [[WHILE_END]] ] -; CHECK-NEXT: [[_CFT_0286:%.*]] = phi i32 [ [[ADD120:%.*]], [[WHILE_END121]] ], [ [[CONV84]], [[WHILE_END]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_1285:%.*]] = phi i32 [ [[SUB:%.*]], [[WHILE_END121]] ], [ [[AND]], [[WHILE_END]] ] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP34:%.*]], [[WHILE_END121]] ], [ [[TMP17]], [[WHILE_END]] ] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[PA_ADDR_1291]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[PB_ADDR_1290]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY93:%.*]] ; CHECK: while.body93: -; CHECK-NEXT: [[_CTT_1283:%.*]] = phi i32 [ [[_CTT_0289]], [[WHILE_BODY88]] ], [ [[ADD99]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CFF_1282:%.*]] = phi i32 [ [[_CFF_0288]], [[WHILE_BODY88]] ], [ [[ADD106]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CTF_1281:%.*]] = phi i32 [ [[_CTF_0287]], [[WHILE_BODY88]] ], [ [[ADD113]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[_CFT_1280:%.*]] = phi i32 [ [[_CFT_0286]], [[WHILE_BODY88]] ], [ [[ADD120]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP10]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] -; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP11]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[A_0279:%.*]] = phi i32 [ [[TMP19]], [[WHILE_BODY88]] ], [ [[SHR96:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[B_0278:%.*]] = phi i32 [ [[TMP20]], [[WHILE_BODY88]] ], [ [[SHR97:%.*]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[SHIFT_0277:%.*]] = phi i32 [ 0, [[WHILE_BODY88]] ], [ [[INC:%.*]], [[WHILE_BODY93]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP18]], [[WHILE_BODY88]] ], [ [[TMP34]], [[WHILE_BODY93]] ] ; CHECK-NEXT: [[AND94:%.*]] = and i32 [[A_0279]], 1 ; CHECK-NEXT: [[AND95:%.*]] = and i32 [[B_0278]], 1 ; CHECK-NEXT: [[SHR96]] = lshr i32 [[A_0279]], 1 ; CHECK-NEXT: [[SHR97]] = lshr i32 [[B_0278]], 1 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0 -; CHECK-NEXT: [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL98]], i1 false -; CHECK-NEXT: [[LAND_EXT:%.*]] = zext i1 [[TMP12]] to i32 -; CHECK-NEXT: [[ADD99]] = add i32 [[_CTT_1283]], [[LAND_EXT]] -; CHECK-NEXT: [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0 -; CHECK-NEXT: [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL103]], i1 false -; CHECK-NEXT: [[LAND_EXT105:%.*]] = zext i1 [[TMP13]] to i32 -; CHECK-NEXT: [[ADD106]] = add i32 [[_CFF_1282]], [[LAND_EXT105]] -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TOBOOL]], i1 [[TOBOOL103]], i1 false -; CHECK-NEXT: [[LAND_EXT112:%.*]] = zext i1 [[TMP14]] to i32 -; CHECK-NEXT: [[ADD113]] = add i32 [[_CTF_1281]], [[LAND_EXT112]] -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TOBOOL100]], i1 [[TOBOOL98]], i1 false -; CHECK-NEXT: [[LAND_EXT119:%.*]] = zext i1 [[TMP15]] to i32 -; CHECK-NEXT: [[ADD120]] = add i32 [[_CFT_1280]], [[LAND_EXT119]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> +; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[SHIFT_0277]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], 32 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[WHILE_END121]], label [[WHILE_BODY93]] @@ -144,61 +127,53 @@ ; CHECK-NEXT: br i1 [[CMP86]], label [[WHILE_BODY88]], label [[WHILE_END122]] ; CHECK: while.end122: ; CHECK-NEXT: [[NBBOOLBLOCK_1_LCSSA:%.*]] = phi i32 [ [[AND]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CFT_0_LCSSA:%.*]] = phi i32 [ [[CONV84]], [[WHILE_END]] ], [ [[ADD120]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CTF_0_LCSSA:%.*]] = phi i32 [ [[CONV72]], [[WHILE_END]] ], [ [[ADD113]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CFF_0_LCSSA:%.*]] = phi i32 [ [[CONV60]], [[WHILE_END]] ], [ [[ADD106]], [[WHILE_END121]] ] -; CHECK-NEXT: [[_CTT_0_LCSSA:%.*]] = phi i32 [ [[CONV48]], [[WHILE_END]] ], [ [[ADD99]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PB_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PB_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR89]], [[WHILE_END121]] ] ; CHECK-NEXT: [[PA_ADDR_1_LCSSA:%.*]] = phi ptr [ [[PA_ADDR_0_LCSSA]], [[WHILE_END]] ], [ [[INCDEC_PTR]], [[WHILE_END121]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP17]], [[WHILE_END]] ], [ [[TMP34]], [[WHILE_END121]] ] ; CHECK-NEXT: [[CMP130_NOT299:%.*]] = icmp eq i32 [[NBBOOLBLOCK_1_LCSSA]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT299]], label [[WHILE_END166:%.*]], label [[WHILE_BODY132_PREHEADER:%.*]] ; CHECK: while.body132.preheader: -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PB_ADDR_1_LCSSA]], align 4 ; CHECK-NEXT: [[SUB125:%.*]] = sub nuw nsw i32 32, [[NBBOOLBLOCK_1_LCSSA]] -; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP16]], [[SUB125]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 -; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP17]], [[SUB125]] +; CHECK-NEXT: [[SHR128:%.*]] = lshr i32 [[TMP36]], [[SUB125]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[PA_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[SHR126:%.*]] = lshr i32 [[TMP37]], [[SUB125]] ; CHECK-NEXT: br label [[WHILE_BODY132:%.*]] ; CHECK: while.body132: -; CHECK-NEXT: [[_CTT_2306:%.*]] = phi i32 [ [[ADD142:%.*]], [[WHILE_BODY132]] ], [ [[_CTT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CFF_2305:%.*]] = phi i32 [ [[ADD150:%.*]], [[WHILE_BODY132]] ], [ [[_CFF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CTF_2304:%.*]] = phi i32 [ [[ADD157:%.*]], [[WHILE_BODY132]] ], [ [[_CTF_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] -; CHECK-NEXT: [[_CFT_2303:%.*]] = phi i32 [ [[ADD164:%.*]], [[WHILE_BODY132]] ], [ [[_CFT_0_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[NBBOOLBLOCK_2302:%.*]] = phi i32 [ [[DEC165:%.*]], [[WHILE_BODY132]] ], [ [[NBBOOLBLOCK_1_LCSSA]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[A_1301:%.*]] = phi i32 [ [[SHR135:%.*]], [[WHILE_BODY132]] ], [ [[SHR126]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[B_1300:%.*]] = phi i32 [ [[SHR136:%.*]], [[WHILE_BODY132]] ], [ [[SHR128]], [[WHILE_BODY132_PREHEADER]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP51:%.*]], [[WHILE_BODY132]] ], [ [[TMP35]], [[WHILE_BODY132_PREHEADER]] ] ; CHECK-NEXT: [[AND133:%.*]] = and i32 [[A_1301]], 1 ; CHECK-NEXT: [[AND134:%.*]] = and i32 [[B_1300]], 1 ; CHECK-NEXT: [[SHR135]] = lshr i32 [[A_1301]], 1 ; CHECK-NEXT: [[SHR136]] = lshr i32 [[B_1300]], 1 -; CHECK-NEXT: [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0 -; CHECK-NEXT: [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL139]], i1 false -; CHECK-NEXT: [[LAND_EXT141:%.*]] = zext i1 [[TMP18]] to i32 -; CHECK-NEXT: [[ADD142]] = add i32 [[_CTT_2306]], [[LAND_EXT141]] -; CHECK-NEXT: [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0 -; CHECK-NEXT: [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL147]], i1 false -; CHECK-NEXT: [[LAND_EXT149:%.*]] = zext i1 [[TMP19]] to i32 -; CHECK-NEXT: [[ADD150]] = add i32 [[_CFF_2305]], [[LAND_EXT149]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TOBOOL137]], i1 [[TOBOOL147]], i1 false -; CHECK-NEXT: [[LAND_EXT156:%.*]] = zext i1 [[TMP20]] to i32 -; CHECK-NEXT: [[ADD157]] = add i32 [[_CTF_2304]], [[LAND_EXT156]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TOBOOL144]], i1 [[TOBOOL139]], i1 false -; CHECK-NEXT: [[LAND_EXT163:%.*]] = zext i1 [[TMP21]] to i32 -; CHECK-NEXT: [[ADD164]] = add i32 [[_CFT_2303]], [[LAND_EXT163]] +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32> +; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] ; CHECK-NEXT: [[DEC165]] = add nsw i32 [[NBBOOLBLOCK_2302]], -1 ; CHECK-NEXT: [[CMP130_NOT:%.*]] = icmp eq i32 [[DEC165]], 0 ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: -; CHECK-NEXT: [[_CFT_2_LCSSA:%.*]] = phi i32 [ [[_CFT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD164]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CTF_2_LCSSA:%.*]] = phi i32 [ [[_CTF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD157]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CFF_2_LCSSA:%.*]] = phi i32 [ [[_CFF_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD150]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[_CTT_2_LCSSA:%.*]] = phi i32 [ [[_CTT_0_LCSSA]], [[WHILE_END122]] ], [ [[ADD142]], [[WHILE_BODY132]] ] -; CHECK-NEXT: store i32 [[_CTT_2_LCSSA]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CFF_2_LCSSA]], ptr [[CFF:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CTF_2_LCSSA]], ptr [[CTF:%.*]], align 4 -; CHECK-NEXT: store i32 [[_CFT_2_LCSSA]], ptr [[CFT:%.*]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 +; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 +; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 +; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i32> [[TMP52]], i32 0 +; CHECK-NEXT: store i32 [[TMP56]], ptr [[CFT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll @@ -9,7 +9,7 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '10' ; YAML-NEXT: ... diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -166,11 +166,43 @@ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] +; +; AVX2-LABEL: @test_v4f64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x double> [[TMP3]] +; +; AVX512-LABEL: @test_v4f64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -266,11 +298,43 @@ } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @test_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @test_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -308,11 +372,43 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX2-LABEL: @test_v4i64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX512-LABEL: @test_v4i64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -334,11 +430,43 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @test_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @test_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -376,11 +504,43 @@ } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX2-LABEL: @test_v16i16( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX512-LABEL: @test_v16i16( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -166,11 +166,43 @@ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] +; +; AVX2-LABEL: @test_v4f64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x double> [[TMP3]] +; +; AVX512-LABEL: @test_v4f64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -266,11 +298,43 @@ } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @test_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @test_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -308,11 +372,43 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX2-LABEL: @test_v4i64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX512-LABEL: @test_v4i64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -334,11 +430,43 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @test_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @test_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -376,11 +504,43 @@ } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX2-LABEL: @test_v16i16( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX512-LABEL: @test_v16i16( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 ; ; 128-bit vectors @@ -145,11 +145,43 @@ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] +; +; AVX2-LABEL: @test_v4f64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x double> [[TMP3]] +; +; AVX512-LABEL: @test_v4f64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -171,11 +203,43 @@ } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @test_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @test_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -213,11 +277,43 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX2-LABEL: @test_v4i64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX512-LABEL: @test_v4i64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -239,11 +335,43 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @test_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @test_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -281,11 +409,43 @@ } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX2-LABEL: @test_v16i16( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX512-LABEL: @test_v16i16( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 ; ; 128-bit vectors @@ -145,11 +145,43 @@ ; define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @test_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] +; SSE-LABEL: @test_v4f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x double> [[TMP7]] +; +; SLM-LABEL: @test_v4f64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x double> [[TMP7]] +; +; AVX-LABEL: @test_v4f64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] +; +; AVX2-LABEL: @test_v4f64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x double> [[TMP3]] +; +; AVX512-LABEL: @test_v4f64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x double> [[TMP3]] ; %a0 = extractelement <4 x double> %a, i32 0 %a1 = extractelement <4 x double> %a, i32 1 @@ -171,11 +203,43 @@ } define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: @test_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] +; SSE-LABEL: @test_v8f32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[TMP7]] +; +; SLM-LABEL: @test_v8f32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[TMP7]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x float> [[TMP3]] +; +; AVX2-LABEL: @test_v8f32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x float> [[TMP3]] +; +; AVX512-LABEL: @test_v8f32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x float> [[TMP3]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 @@ -213,11 +277,43 @@ } define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: @test_v4i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; SSE-LABEL: @test_v4i64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SSE-NEXT: ret <4 x i64> [[TMP7]] +; +; SLM-LABEL: @test_v4i64( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: ret <4 x i64> [[TMP7]] +; +; AVX-LABEL: @test_v4i64( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX2-LABEL: @test_v4i64( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x i64> [[TMP3]] +; +; AVX512-LABEL: @test_v4i64( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <4 x i64> [[TMP3]] ; %a0 = extractelement <4 x i64> %a, i32 0 %a1 = extractelement <4 x i64> %a, i32 1 @@ -239,11 +335,43 @@ } define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: @test_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; SSE-LABEL: @test_v8i32( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[TMP7]] +; +; SLM-LABEL: @test_v8i32( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> +; SLM-NEXT: ret <8 x i32> [[TMP7]] +; +; AVX-LABEL: @test_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX2-LABEL: @test_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <8 x i32> [[TMP3]] +; +; AVX512-LABEL: @test_v8i32( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <8 x i32> [[TMP3]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 @@ -281,11 +409,43 @@ } define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: @test_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; SSE-LABEL: @test_v16i16( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SSE-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: ret <16 x i16> [[TMP7]] +; +; SLM-LABEL: @test_v16i16( +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> +; SLM-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]] +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]] +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> +; SLM-NEXT: ret <16 x i16> [[TMP7]] +; +; AVX-LABEL: @test_v16i16( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX2-LABEL: @test_v16i16( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <16 x i16> [[TMP3]] +; +; AVX512-LABEL: @test_v16i16( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX512-NEXT: ret <16 x i16> [[TMP3]] ; %a0 = extractelement <16 x i16> %a, i32 0 %a1 = extractelement <16 x i16> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll @@ -2,23 +2,24 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -pass-remarks-output=%t | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s -; YAML: --- !Passed +; YAML: --- !Missed ; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Name: NotBeneficial ; YAML-NEXT: Function: g ; YAML-NEXT: Args: -; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-1' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '4' +; YAML-NEXT: - String: 'List vectorization was possible but not beneficial with cost ' +; YAML-NEXT: - Cost: '0' +; YAML-NEXT: - String: ' >= ' +; YAML-NEXT: - Treshold: '0' define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: ret <2 x i32> [[TMP4]] +; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1 +; CHECK-NEXT: [[X1X1:%.*]] = mul i32 [[X1]], [[X1]] +; CHECK-NEXT: [[AB:%.*]] = mul i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i32> poison, i32 [[X1X1]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i32> [[INS1]], i32 [[AB]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[INS2]] ; %x1 = extractelement <2 x i32> %x, i32 1 %x1x1 = mul i32 %x1, %x1