diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1148,18 +1148,6 @@
   /// Construct a vectorizable tree that starts at \p Roots.
   void buildTree(ArrayRef<Value *> Roots);
 
-  /// Checks if the very first tree node is going to be vectorized.
-  bool isVectorizedFirstNode() const {
-    return !VectorizableTree.empty() &&
-           VectorizableTree.front()->State == TreeEntry::Vectorize;
-  }
-
-  /// Returns the main instruction for the very first node.
-  Instruction *getFirstNodeMainOp() const {
-    assert(!VectorizableTree.empty() && "No tree to get the first node from");
-    return VectorizableTree.front()->getMainOp();
-  }
-
   /// Returns whether the root node has in-tree uses.
   bool doesRootHaveInTreeUses() const {
     return !VectorizableTree.empty() &&
@@ -13340,22 +13328,7 @@
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost(VL);
       InstructionCost ReductionCost =
-          getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
-      if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
-        Instruction *MainOp = V.getFirstNodeMainOp();
-        for (Value *V : VL) {
-          auto *VI = dyn_cast<LoadInst>(V);
-          // Add the costs of scalar GEP pointers, to be removed from the
-          // code.
-          if (!VI || VI == MainOp)
-            continue;
-          auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-          if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
-            continue;
-          TreeCost -= TTI->getArithmeticInstrCost(
-              Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
-        }
-      }
+          getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
       InstructionCost Cost = TreeCost + ReductionCost;
       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
       if (!Cost.isValid())
@@ -13591,7 +13564,8 @@
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
-                                   unsigned ReduxWidth, FastMathFlags FMF) {
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
@@ -13600,6 +13574,35 @@
     // If all of the reduced values are constant, the vector cost is 0, since
     // the reduction value can be calculated at the compile time.
     bool AllConsts = allConstant(ReducedVals);
+    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+      InstructionCost Cost = 0;
+      // Scalar cost is repeated for N-1 elements.
+      int Cnt = ReducedVals.size();
+      for (Value *RdxVal : ReducedVals) {
+        if (Cnt == 1)
+          break;
+        --Cnt;
+        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
+          Cost += GenCostFn();
+          continue;
+        }
+        InstructionCost ScalarCost = 0;
+        for (User *U : RdxVal->users()) {
+          auto *RdxOp = cast<Instruction>(U);
+          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+            continue;
+          }
+          ScalarCost = InstructionCost::getInvalid();
+          break;
+        }
+        if (ScalarCost.isValid())
+          Cost += ScalarCost;
+        else
+          Cost += GenCostFn();
+      }
+      return Cost;
+    };
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -13612,7 +13615,9 @@
       if (!AllConsts)
         VectorCost =
             TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
-      ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      });
       break;
     }
     case RecurKind::FMax:
@@ -13626,10 +13631,12 @@
                                              /*IsUnsigned=*/false, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     case RecurKind::SMax:
@@ -13646,18 +13653,18 @@
                                              IsUnsigned, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     default:
       llvm_unreachable("Expected arithmetic or min/max reduction operation");
     }
-    // Scalar cost is repeated for N-1 elements.
-    ScalarCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                       << " for reduction that starts with " << *FirstReducedVal
                       << " (It is a splitting reduction)\n");
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -1113,41 +1113,18 @@
 }
 
 define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {
-; SSE2-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; SSE2-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; SSE2-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; SSE2-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; SSE2-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; SSE2-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; SSE2-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; SSE2-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; SSE2-NEXT:    ret i64 [[M]]
-;
-; SSE4-LABEL: @umax_intrinsic_rdx_v4i64(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; SSE4-NEXT:    ret i64 [[TMP2]]
-;
-; AVX-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
-; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
-; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
-; AVX-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
-; AVX-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
-; AVX-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
-; AVX-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
-; AVX-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
-; AVX-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
-; AVX-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
-; AVX-NEXT:    ret i64 [[M]]
-;
-; AVX2-LABEL: @umax_intrinsic_rdx_v4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
-; AVX2-NEXT:    ret i64 [[TMP2]]
+; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
+; DEFAULT-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
+; DEFAULT-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
+; DEFAULT-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
+; DEFAULT-NEXT:    [[T0:%.*]] = load i64, ptr [[P0]], align 4
+; DEFAULT-NEXT:    [[T1:%.*]] = load i64, ptr [[P1]], align 4
+; DEFAULT-NEXT:    [[T2:%.*]] = load i64, ptr [[P2]], align 4
+; DEFAULT-NEXT:    [[T3:%.*]] = load i64, ptr [[P3]], align 4
+; DEFAULT-NEXT:    [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
+; DEFAULT-NEXT:    [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
+; DEFAULT-NEXT:    [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
+; DEFAULT-NEXT:    ret i64 [[M]]
 ;
 ; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
 ; THRESH-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
@@ -1252,5 +1229,3 @@
   %t14 = call i32 @llvm.umin.i32(i32 %t13, i32 93)
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
@@ -22,10 +22,25 @@
 }
 
 define i32 @smax_v4i32(i32) {
-; CHECK-LABEL: @smax_v4i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; SSE2-LABEL: @smax_v4i32(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
+; SSE2-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
+; SSE2-NEXT:    ret i32 [[TMP8]]
+;
+; SSE4-LABEL: @smax_v4i32(
+; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; SSE4-NEXT:    ret i32 [[TMP3]]
+;
+; AVX-LABEL: @smax_v4i32(
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
+; AVX-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, ptr @arr, align 16
   %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
@@ -100,8 +115,3 @@
   %32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
   ret i32 %32
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
-; SSE2: {{.*}}
-; SSE4: {{.*}}
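
The behavioral core of the patch is the EvaluateScalarCost lambda: instead of
multiplying one scalar-op cost by ReduxWidth - 1, each reduced value except the
last is costed individually, and a value with extra uses outside the reduction
(3+ uses for cmp/select min/max patterns, since the reduction itself accounts
for two of them; 2+ otherwise) falls back to the generic per-op cost. Below is
a minimal standalone C++ sketch of that accounting, not part of the patch: the
ToyValue struct and integer costs are invented stand-ins for the LLVM TTI API,
and the invalid-cost fallback path of the real lambda is omitted.

// Toy model of the per-scalar reduction costing above (illustrative only).
#include <cstdio>
#include <vector>

struct ToyValue {
  int NumUses;   // total number of users of this reduced value
  int UsersCost; // summed cost of its in-reduction users
};

int evaluateScalarCost(const std::vector<ToyValue> &ReducedVals,
                       bool IsCmpSelMinMax, int GenCost) {
  int Cost = 0;
  int Cnt = static_cast<int>(ReducedVals.size());
  for (const ToyValue &RdxVal : ReducedVals) {
    if (Cnt == 1)
      break; // N values need only N-1 scalar reduction ops.
    --Cnt;
    // A min/max reduction expressed as cmp+select gives each value two
    // in-reduction users, so the "reused elsewhere" threshold is 3, not 2.
    if (RdxVal.NumUses >= (IsCmpSelMinMax ? 3 : 2)) {
      Cost += GenCost; // Reused outside the reduction: generic op cost.
      continue;
    }
    Cost += RdxVal.UsersCost; // Otherwise charge the actual users' cost.
  }
  return Cost;
}

int main() {
  // Four reduced values; the third is also used outside the reduction.
  std::vector<ToyValue> Vals = {{1, 2}, {1, 2}, {3, 2}, {1, 2}};
  std::printf("scalar cost = %d\n",
              evaluateScalarCost(Vals, /*IsCmpSelMinMax=*/false,
                                 /*GenCost=*/2));
}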