diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -39,6 +39,7 @@ //===----------------------------------------------------------------------===// #include "X86TargetTransformInfo.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" @@ -1186,6 +1187,77 @@ auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), LegalVT.getVectorNumElements()); + if (!Mask.empty() && NumOfDests.isValid()) { + // Try to perform better estimation of the permutation. + // 1. Split the source/destination vectors into real registers. + // 2. Do the mask analysis to identify which real registers are + // permuted. If more than 1 source registers are used for the + // destination register building, the cost for this destination register + // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one + // source register is used, build mask and calculate the cost as a cost + // of PermuteSingleSrc. + // Also, for the single register permute we try to identify if the + // destination register is just a copy of the source register or the + // copy of the previous destination register (the cost is + // TTI::TCC_Basic). If the source register is just reused, the cost for + // this operation is 0. + InstructionCost Cost = 0; + int Sz = Mask.size(); + unsigned E = *NumOfDests.getValue(); + unsigned SzDest = Sz / E; + unsigned SzSrc = Sz / NumOfSrcs; + unsigned PrevSrcReg = 0; + SmallVector PrevRegMask; + for (unsigned I = 0; I < E; ++I) { + SmallVector RegMask(SzDest, UndefMaskElem); + SmallBitVector UsedSrcRegs(NumOfSrcs, false); + // Check that the values in dest registers are in the one src + // register. + for (unsigned K = 0; K < SzDest; ++K) { + int Idx = I * SzDest + K; + if (Idx == Sz) + break; + if (Mask[Idx] >= Sz || Mask[Idx] == UndefMaskElem) + continue; + int SrcRegIdx = Mask[Idx] / SzSrc; + // Add a cost of PermuteTwoSrc for each new source register permute, + // if we have more than one source registers. + UsedSrcRegs.set(SrcRegIdx); + RegMask[K] = Mask[Idx] % SzSrc; + } + int NumSrcRegs = UsedSrcRegs.count(); + // NumSrcRegs might be 0 if the mask is just undef. + if (NumSrcRegs == 1) { + // Analysis of the single source register permutation. + unsigned SrcReg = UsedSrcRegs.find_first(); + if (!ShuffleVectorInst::isIdentityMask(RegMask)) { + // Check if the previous register can be just copied to the next + // one. + if (PrevRegMask.empty() || PrevSrcReg != SrcReg || + PrevRegMask != RegMask) + Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, + RegMask, 0, nullptr); + else + // Just a copy of previous destination register. + Cost += TTI::TCC_Basic; + } else if (SrcReg != I && any_of(RegMask, [](int I) { + return I != UndefMaskElem; + })) { + // Just a copy of the source register. + Cost += TTI::TCC_Basic; + } + PrevSrcReg = SrcReg; + PrevRegMask.swap(RegMask); + } else if (NumSrcRegs > 1) { + // TODO: Add analysis of regmask for better cost estimation. + Cost += + (NumSrcRegs - 1) * getShuffleCost(TTI::SK_PermuteTwoSrc, + SingleOpTy, None, 0, nullptr); + } + } + return Cost; + } + InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, None, 0, nullptr); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1579,9 +1579,7 @@ /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - InstructionCost - getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const; + InstructionCost getGatherCost(FixedVectorType *Ty, ArrayRef Mask) const; /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. @@ -3570,6 +3568,7 @@ // Process extracts in blocks of EltsPerVector to check if the source vector // operand can be re-used directly. If not, add the cost of creating a shuffle // to extract the values into a vector register. + SmallVector RegMask(EltsPerVector, UndefMaskElem); for (auto *V : VL) { ++Idx; @@ -3585,6 +3584,7 @@ unsigned PrevIdx = *getExtractIndex(cast(VL[Idx - 1])); AllConsecutive &= PrevIdx + 1 == CurrentIdx && CurrentIdx % EltsPerVector == Idx % EltsPerVector; + RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector; if (AllConsecutive) continue; @@ -3598,7 +3598,8 @@ // cost to extract the a vector with EltsPerVector elements. Cost += TTI.getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); + FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); + RegMask.assign(EltsPerVector, UndefMaskElem); } return Cost; } @@ -3734,8 +3735,7 @@ // Found the broadcasting of the single scalar, calculate the cost as the // broadcast. return ReuseShuffleCost + - TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None, - 0); + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { @@ -4169,7 +4169,7 @@ Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0); } VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); + TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -4697,19 +4697,20 @@ return None; } -InstructionCost -BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const { - unsigned NumElts = Ty->getNumElements(); - APInt DemandedElts = APInt::getNullValue(NumElts); - for (unsigned I = 0; I < NumElts; ++I) - if (!ShuffledIndices.count(I)) - DemandedElts.setBit(I); +InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, + ArrayRef Mask) const { + int NumElts = Ty->getNumElements(); + APInt DemandedElts(NumElts, 0); + for_each(Mask, [&DemandedElts](int Idx) { + if (Idx > 0) + DemandedElts.setBit(Idx); + }); InstructionCost Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (!ShuffledIndices.empty()) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); + if (!ShuffleVectorInst::isIdentityMask(Mask)) + Cost += + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty, Mask); return Cost; } @@ -4722,17 +4723,17 @@ // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. - DenseSet ShuffledElements; - DenseSet UniqueElements; + DenseMap UniqueElements; + SmallVector Mask(VL.size(), UndefMaskElem); // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; if (isConstant(VL[Idx])) continue; - if (!UniqueElements.insert(VL[Idx]).second) - ShuffledElements.insert(Idx); + auto Res = UniqueElements.try_emplace(VL[Idx], Idx); + Mask[Idx] = Res.first->second; } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, Mask); } // Perform operand reordering on the instructions in VL and return the reordered diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -63,11 +63,11 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SSE-LABEL: 'reduction_cost_int' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -93,11 +93,11 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'reduction_cost_int' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -418,25 +418,25 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-LABEL: 'no_pairwise_reduction4double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'no_pairwise_reduction4double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'no_pairwise_reduction4double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -458,9 +458,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'no_pairwise_reduction4double' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -476,31 +476,31 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-LABEL: 'no_pairwise_reduction8float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -526,11 +526,11 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'no_pairwise_reduction8float' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -600,9 +600,9 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-LABEL: 'no_pairwise_reduction4i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -624,9 +624,9 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'no_pairwise_reduction4i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -704,11 +704,11 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-LABEL: 'no_pairwise_reduction8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -734,11 +734,11 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'no_pairwise_reduction8i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -862,31 +862,31 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-LABEL: 'pairwise_reduction4double' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -912,11 +912,11 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SLM-LABEL: 'pairwise_reduction4double' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r @@ -934,40 +934,40 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-LABEL: 'pairwise_reduction8float' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -999,14 +999,14 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction8float' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r @@ -1082,11 +1082,11 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-LABEL: 'pairwise_reduction4i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -1112,11 +1112,11 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction4i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r @@ -1214,14 +1214,14 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-LABEL: 'pairwise_reduction8i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r @@ -1253,14 +1253,14 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction8i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -21,29 +21,29 @@ ; SSE-LABEL: 'test_vXf64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXf64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf64' @@ -64,25 +64,25 @@ ; SSE-LABEL: 'test_vXi64' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi64' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi64' @@ -101,29 +101,29 @@ ; SSE-LABEL: 'test_vXf32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXf32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXf32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXf32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXf32' @@ -144,17 +144,17 @@ ; SSE-LABEL: 'test_vXi32' ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi32' @@ -162,15 +162,15 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_vXi32' @@ -194,27 +194,27 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi16' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi16' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi16' @@ -222,8 +222,8 @@ ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi16' @@ -231,8 +231,8 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi16' @@ -240,8 +240,8 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512F-LABEL: 'test_vXi16' @@ -250,7 +250,7 @@ ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'test_vXi16' @@ -286,8 +286,8 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSSE3-LABEL: 'test_vXi8' @@ -295,8 +295,8 @@ ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSSE3-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_vXi8' @@ -304,8 +304,8 @@ ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; XOP-LABEL: 'test_vXi8' @@ -314,7 +314,7 @@ ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_vXi8' @@ -323,7 +323,7 @@ ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_vXi8' @@ -332,7 +332,7 @@ ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512F-LABEL: 'test_vXi8' diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefix=CHECK +; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION define void @Test(i32) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -12,11 +12,12 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 ; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 -; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 -; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[A_CAST]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[B_CAST]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP1]], ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1 -; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]], align 8 -; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[PTR]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -103,16 +103,16 @@ ; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[T111:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T111]], float [[T13]], i32 2 ; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 ; CHECK-NEXT: ret <4 x float> [[T15]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -103,16 +103,16 @@ ; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 ; CHECK-NEXT: [[T3:%.*]] = bitcast float* [[T2]] to i64* ; CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[T111:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 ; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T111]], float [[T13]], i32 2 ; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 ; CHECK-NEXT: ret <4 x float> [[T15]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -56,35 +56,22 @@ ; ensure correctness. ; define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { -; SSE-LABEL: @PR31243_sext( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 -; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 -; SSE-NEXT: [[TMP2:%.*]] = sext i8 [[TMP0]] to i64 -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] -; SSE-NEXT: [[TMP3:%.*]] = sext i8 [[TMP1]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] -; SSE-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 -; SSE-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 -; SSE-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] -; SSE-NEXT: ret i8 [[TMP8]] -; -; AVX-LABEL: @PR31243_sext( -; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 -; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] -; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] -; AVX-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 -; AVX-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] -; AVX-NEXT: ret i8 [[TMP8]] +; CHECK-LABEL: @PR31243_sext( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: ret i8 [[TMP8]] ; entry: %tmp0 = sext i8 %v0 to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -17,16 +17,17 @@ ; SSE-NEXT: [[OR_1:%.*]] = or i64 undef, 1 ; SSE-NEXT: store i64 [[OR_1]], i64* undef, align 8 ; SSE-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 -; SSE-NEXT: [[FOO_3:%.*]] = load i64, i64* [[FOO_1]], align 8 ; SSE-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; SSE-NEXT: [[FOO_4:%.*]] = load i64, i64* [[FOO_2]], align 8 +; SSE-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 ; SSE-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; SSE-NEXT: [[AND_2:%.*]] = and i64 [[OR_1]], [[FOO_3]] -; SSE-NEXT: [[AND_1:%.*]] = and i64 [[BAR5]], [[FOO_4]] +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] ; SSE-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 -; SSE-NEXT: store i64 [[AND_2]], i64* [[BAR3]], align 8 ; SSE-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; SSE-NEXT: store i64 [[AND_1]], i64* [[BAR4]], align 8 +; SSE-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @_ZN1C10SwitchModeEv( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -244,32 +244,23 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, <4 x i32*> [[TMP8]], <4 x i64> +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <8 x i32*> [[TMP10]], i32* [[TMP3]], i32 1 +; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP6]], <2 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32*> [[TMP11]], <8 x i32*> [[TMP12]], <8 x i32> +; AVX512-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP9]], <4 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32*> [[TMP13]], <8 x i32*> [[TMP14]], <8 x i32> +; AVX512-NEXT: [[TMP16:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP15]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP16]], +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -426,32 +417,23 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32*> [[TMP1]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <2 x i32*> [[TMP2]], <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, <4 x i32*> [[TMP5]], <4 x i64> +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <8 x i32*> poison, i32* [[T1]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <8 x i32*> [[TMP7]], i32* [[T6]], i32 1 +; AVX512-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32*> [[TMP8]], <8 x i32*> [[TMP9]], <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32*> [[TMP10]], <8 x i32*> [[TMP11]], <8 x i32> +; AVX512-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP14:%.*]] = add <8 x i32> [[TMP13]], +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -244,32 +244,23 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <2 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32*> [[TMP4]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, <4 x i32*> [[TMP8]], <4 x i64> +; AVX512-NEXT: [[TMP10:%.*]] = insertelement <8 x i32*> poison, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[TMP11:%.*]] = insertelement <8 x i32*> [[TMP10]], i32* [[TMP3]], i32 1 +; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32*> [[TMP6]], <2 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32*> [[TMP11]], <8 x i32*> [[TMP12]], <8 x i32> +; AVX512-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP9]], <4 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32*> [[TMP13]], <8 x i32*> [[TMP14]], <8 x i32> +; AVX512-NEXT: [[TMP16:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP15]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP17:%.*]] = add <8 x i32> [[TMP16]], +; AVX512-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -426,32 +417,23 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x i32*> poison, i32* [[T1]], i32 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32*> [[TMP1]], <2 x i32*> poison, <2 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <2 x i32*> [[TMP2]], <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i32 0 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, <4 x i32*> [[TMP5]], <4 x i64> +; AVX512-NEXT: [[TMP7:%.*]] = insertelement <8 x i32*> poison, i32* [[T1]], i32 0 +; AVX512-NEXT: [[TMP8:%.*]] = insertelement <8 x i32*> [[TMP7]], i32* [[T6]], i32 1 +; AVX512-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32*> [[TMP3]], <2 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32*> [[TMP8]], <8 x i32*> [[TMP9]], <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <8 x i32> +; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32*> [[TMP10]], <8 x i32*> [[TMP11]], <8 x i32> +; AVX512-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP12]], i32 4, <8 x i1> , <8 x i32> undef), !tbaa [[TBAA0]] +; AVX512-NEXT: [[TMP14:%.*]] = add <8 x i32> [[TMP13]], +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX512-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll @@ -43,7 +43,7 @@ ; YAML-NEXT: Function: fextr ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-4' +; YAML-NEXT: - Cost: '-5' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '4' diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -27,7 +27,6 @@ ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]] ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 @@ -42,14 +41,20 @@ ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[T28]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4 -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5 -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -27,7 +27,6 @@ ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]] ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 @@ -42,14 +41,20 @@ ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4 -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5 -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*