diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1607,10 +1607,12 @@ Value *vectorizeTree(ArrayRef VL); /// \returns the scalarization cost for this type. Scalarization in this - /// context means the creation of vectors from a group of scalars. - InstructionCost - getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const; + /// context means the creation of vectors from a group of scalars. If \p + /// NeedToShuffle is true, need to add a cost of reshuffling some of the + /// vector elements. + InstructionCost getGatherCost(FixedVectorType *Ty, + const DenseSet &ShuffledIndices, + bool NeedToShuffle) const; /// Checks if the gathered \p VL can be represented as shuffle(s) of previous /// tree entries. @@ -5580,7 +5582,8 @@ InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const { + const DenseSet &ShuffledIndices, + bool NeedToShuffle) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getZero(NumElts); for (unsigned I = 0; I < NumElts; ++I) @@ -5589,7 +5592,7 @@ InstructionCost Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (!ShuffledIndices.empty()) + if (NeedToShuffle) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } @@ -5600,6 +5603,7 @@ if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + bool DuplicateNonConst = false; // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. @@ -5608,12 +5612,17 @@ // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (isConstant(VL[Idx])) + // No need to shuffle duplicates for constants. + if (isConstant(VL[Idx])) { + ShuffledElements.insert(Idx); continue; - if (!UniqueElements.insert(VL[Idx]).second) + } + if (!UniqueElements.insert(VL[Idx]).second) { + DuplicateNonConst = true; ShuffledElements.insert(Idx); + } } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst); } // Perform operand reordering on the instructions in VL and return the reordered diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefix=CHECK +; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 | FileCheck %s --check-prefix=CHECK ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION define void @Test(i32) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -343,22 +343,19 @@ ; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 ; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 ; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], -; CHECK-NEXT: [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]] -; CHECK-NEXT: [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]] -; CHECK-NEXT: [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]] -; CHECK-NEXT: [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]] -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[Y3]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = freeze <8 x i1> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP10]]) +; CHECK-NEXT: ret i1 [[TMP11]] ; %x0 = extractelement <8 x i32> %x, i32 0 %x1 = extractelement <8 x i32> %x, i32 1