diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6441,7 +6441,46 @@
       if (NeedToShuffleReuses) {
         CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
       }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+
+      // Try to estimate the number of operations in VL which could be combined
+      // with their user to a single fmuladd/fmulsub instruction, if those are
+      // cheap on the target.
+      unsigned NumScalarFusable = 0;
+      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        const Instruction *I = cast<Instruction>(VL[i]);
+        if (!I->hasOneUse() || ShuffleOrOp != Instruction::FMul ||
+            !I->hasAllowReassoc())
+          continue;
+
+        const Instruction *UserI = cast<Instruction>(*I->user_begin());
+        bool IsCandOpcode = UserI->getOpcode() == Instruction::FSub ||
+                            UserI->getOpcode() == Instruction::FAdd;
+        if (IsCandOpcode && UserI->getOperand(0) == I &&
+            UserI->hasAllowReassoc()) {
+          // TODO: How can we estimate the cost of fmulsub, for which we don't
+          // have an intrinsic?
+          FastMathFlags FMF = I->getFastMathFlags();
+          FMF &= UserI->getFastMathFlags();
+          IntrinsicCostAttributes ICA(
+              Intrinsic::fmuladd, VL[i]->getType(),
+              {VL[i]->getType(), VL[i]->getType(), VL[i]->getType()}, FMF);
+          NumScalarFusable +=
+              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput) <=
+              ScalarEltCost;
+        }
+      }
+
+      // All operations in VL can be fused to fmuladd/fmulsub with their single
+      // users, so the whole bundle can be considered free both when scalarized
+      // and when vectorized. If the common cost is low, slightly biasing
+      // towards using the vector version should be profitable in most cases.
+      if (NumScalarFusable == VecTy->getNumElements())
+        return CommonCost - 1;
+
+      // If some of the scalar operations can be fused with their users to
+      // fmuladd/fmulsub, consider them as free.
+      InstructionCost ScalarCost =
+          (VecTy->getNumElements() - NumScalarFusable) * ScalarEltCost;
       for (unsigned I = 0, Num = VL0->getNumOperands(); I < Num; ++I) {
         if (all_of(VL, [I](Value *V) {
               return isConstant(cast<Instruction>(V)->getOperand(I));
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -7,21 +7,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP11]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -47,21 +46,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub reassoc <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd reassoc <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP11]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -5,18 +5,20 @@
 ; adds may look profitable, but is not because it eliminates generation of
 ; floating-point FMAs that would be more profitable.
 
-; FIXME: We generate a horizontal reduction today.
-
 define void @hr() {
 ; CHECK-LABEL: @hr(
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD3]] = fadd fast double [[MUL3]], [[ADD2]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
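
Note on the heuristic (commentary, not part of the patch): the scalar shape the new cost check looks for is a single-use fmul with the reassoc flag whose only user is an fadd or fsub that consumes it as its first operand. On targets where llvm.fmuladd is no more expensive than a plain fmul, such a pair is expected to become one fused multiply-add, so the scalar lane is treated as free. A minimal IR sketch of the matched shape (illustrative names, assuming a target with cheap fmuladd):

  define float @fusable_pair(float %a, float %b, float %c) {
    ; Single-use fmul feeding an fadd as operand 0; this lane would be
    ; counted in NumScalarFusable by the new cost-model check.
    %mul = fmul reassoc float %a, %b
    %add = fadd reassoc float %mul, %c
    ret float %add
  }

When every lane of a bundle is fusable this way, the change returns CommonCost - 1 to bias slightly towards the vector form; otherwise only the fusable lanes are deducted from ScalarCost, which is what keeps the tests above scalar and preserves their FMA opportunities.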