diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1147,18 +1147,6 @@
   /// Construct a vectorizable tree that starts at \p Roots.
   void buildTree(ArrayRef<Value *> Roots);
 
-  /// Checks if the very first tree node is going to be vectorized.
-  bool isVectorizedFirstNode() const {
-    return !VectorizableTree.empty() &&
-           VectorizableTree.front()->State == TreeEntry::Vectorize;
-  }
-
-  /// Returns the main instruction for the very first node.
-  Instruction *getFirstNodeMainOp() const {
-    assert(!VectorizableTree.empty() && "No tree to get the first node from");
-    return VectorizableTree.front()->getMainOp();
-  }
-
   /// Returns whether the root node has in-tree uses.
   bool doesRootHaveInTreeUses() const {
     return !VectorizableTree.empty() &&
@@ -13313,22 +13301,7 @@
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost(VL);
       InstructionCost ReductionCost =
-          getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
-      if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
-        Instruction *MainOp = V.getFirstNodeMainOp();
-        for (Value *V : VL) {
-          auto *VI = dyn_cast<LoadInst>(V);
-          // Add the costs of scalar GEP pointers, to be removed from the
-          // code.
-          if (!VI || VI == MainOp)
-            continue;
-          auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-          if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
-            continue;
-          TreeCost -= TTI->getArithmeticInstrCost(
-              Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
-        }
-      }
+          getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
       InstructionCost Cost = TreeCost + ReductionCost;
       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
       if (!Cost.isValid())
@@ -13564,7 +13537,8 @@
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
-                                   unsigned ReduxWidth, FastMathFlags FMF) {
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
@@ -13573,6 +13547,35 @@
     // If all of the reduced values are constant, the vector cost is 0, since
     // the reduction value can be calculated at the compile time.
     bool AllConsts = allConstant(ReducedVals);
+    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
+      InstructionCost Cost = 0;
+      // Scalar cost is repeated for N-1 elements.
+      int Cnt = ReducedVals.size();
+      for (Value *RdxVal : ReducedVals) {
+        if (Cnt == 1)
+          break;
+        --Cnt;
+        if (RdxVal->hasNUsesOrMore(3)) {
+          Cost += GenCostFn();
+          continue;
+        }
+        InstructionCost ScalarCost = 0;
+        for (User *U : RdxVal->users()) {
+          auto *RdxOp = cast<Instruction>(U);
+          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
+            continue;
+          }
+          ScalarCost = InstructionCost::getInvalid();
+          break;
+        }
+        if (ScalarCost.isValid())
+          Cost += ScalarCost;
+        else
+          Cost += GenCostFn();
+      }
+      return Cost;
+    };
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -13585,7 +13588,9 @@
       if (!AllConsts)
         VectorCost =
             TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
-      ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+      });
       break;
     }
     case RecurKind::FMax:
@@ -13599,10 +13604,12 @@
                                                /*IsUnsigned=*/false, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     case RecurKind::SMax:
@@ -13619,18 +13626,18 @@
                                                IsUnsigned, CostKind);
       }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
-      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind) +
-                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                           SclCondTy, RdxPred, CostKind);
+      ScalarCost = EvaluateScalarCost([&]() {
+        return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind) +
+               TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
+                                       RdxPred, CostKind);
+      });
       break;
     }
     default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }
-    // Scalar cost is repeated for N-1 elements.
-    ScalarCost *= (ReduxWidth - 1);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                       << " for reduction that starts with " << *FirstReducedVal
                       << " (It is a splitting reduction)\n");
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -6,12 +6,15 @@
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> [[V1:%.*]], <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i64 1
+; CHECK-NEXT:    ret <2 x i64> [[TMP3_1]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -30,14 +33,14 @@
 define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    store <2 x i64> [[TMP9]], ptr [[C:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.1 = getelementptr i64, ptr %a, i64 1
@@ -60,12 +63,29 @@
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> [[V0:%.*]], i64 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> [[V0]], i64 2
+; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> [[V0]], i64 3
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32>
+; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
+; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> [[V1]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32>
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
+; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i64 1
+; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i64 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -96,13 +116,17 @@
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i64 1
+; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP3]], i64 2
+; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i64 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -167,10 +191,10 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -6,12 +6,15 @@
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> [[V1:%.*]], <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i64 1
+; CHECK-NEXT:    ret <2 x i64> [[TMP3_1]]
 ;
   %v0.0 = extractelement <2 x i64> %v0, i32 0
   %v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -30,14 +33,14 @@
 define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    store <2 x i64> [[TMP9]], ptr [[C:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.1 = getelementptr i64, ptr %a, i64 1
@@ -60,12 +63,29 @@
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> [[V0:%.*]], i64 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> [[V0]], i64 2
+; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> [[V0]], i64 3
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32>
+; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
+; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> [[V1]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> [[V1]], <4 x i32>
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
+; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i64 1
+; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_3]], i64 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -96,13 +116,17 @@
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> [[V1:%.*]], <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i64 1
+; CHECK-NEXT:    [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP3]], i64 2
+; CHECK-NEXT:    [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i64 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -167,10 +191,10 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
@@ -88,11 +88,46 @@
   ret i32 %16
 }
 
+; FIXME: looks like the cost of @llvm.smax.i32 is not correct, lowered as select+cmp
 define i32 @smax_v16i32(i32) {
-; CHECK-LABEL: @smax_v16i32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; SSE-LABEL: @smax_v16i32(
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 8), align 16
+; SSE-NEXT:    [[TMP11:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 9), align 4
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 10), align 8
+; SSE-NEXT:    [[TMP13:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 11), align 4
+; SSE-NEXT:    [[TMP14:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 12), align 16
+; SSE-NEXT:    [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 13), align 4
+; SSE-NEXT:    [[TMP16:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 14), align 8
+; SSE-NEXT:    [[TMP17:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 15), align 4
+; SSE-NEXT:    [[TMP18:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
+; SSE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP18]], i32 [[TMP4]])
+; SSE-NEXT:    [[TMP20:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP19]], i32 [[TMP5]])
+; SSE-NEXT:    [[TMP21:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP20]], i32 [[TMP6]])
+; SSE-NEXT:    [[TMP22:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP21]], i32 [[TMP7]])
+; SSE-NEXT:    [[TMP23:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP22]], i32 [[TMP8]])
+; SSE-NEXT:    [[TMP24:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP23]], i32 [[TMP9]])
+; SSE-NEXT:    [[TMP25:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP24]], i32 [[TMP10]])
+; SSE-NEXT:    [[TMP26:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP25]], i32 [[TMP11]])
+; SSE-NEXT:    [[TMP27:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP26]], i32 [[TMP12]])
+; SSE-NEXT:    [[TMP28:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP27]], i32 [[TMP13]])
+; SSE-NEXT:    [[TMP29:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP28]], i32 [[TMP14]])
+; SSE-NEXT:    [[TMP30:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP29]], i32 [[TMP15]])
+; SSE-NEXT:    [[TMP31:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP30]], i32 [[TMP16]])
+; SSE-NEXT:    [[TMP32:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP31]], i32 [[TMP17]])
+; SSE-NEXT:    ret i32 [[TMP32]]
+;
+; AVX-LABEL: @smax_v16i32(
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
+; AVX-NEXT:    ret i32 [[TMP3]]
 ;
   %2 = load i32, ptr @arr, align 16
   %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4