diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -773,6 +773,37 @@
       TTIImpl->getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtIs);
+
+  // Check the context instructions to see if we have FMUL instructions that
+  // can be fused with their users into an fmuladd/fmulsub instruction. In
+  // those cases, consider the FMUL free if the cost of the fmuladd/fmulsub is
+  // less than or equal to the cost of the FAdd/FSub alone.
+  if (Opcode == Instruction::FMul &&
+      ((isa<VectorType>(Ty) && CxtIs.size() > 1) || CxtIs.size() == 1) &&
+      all_of(CxtIs, [&](const Instruction *CxtI) {
+        if (!CxtI || CxtI->getOpcode() != Opcode || !CxtI->hasOneUse() ||
+            !CxtI->hasAllowReassoc())
+          return false;
+        const Instruction *UserI = cast<Instruction>(*CxtI->user_begin());
+        bool IsCandOpcode = UserI->getOpcode() == Instruction::FSub ||
+                            UserI->getOpcode() == Instruction::FAdd;
+        if (IsCandOpcode && UserI->getOperand(0) == CxtI &&
+            UserI->hasAllowReassoc()) {
+          // TODO: How can we estimate the cost of fmulsub, for which we don't
+          // have an intrinsic?
+          FastMathFlags FMF = CxtI->getFastMathFlags();
+          FMF &= UserI->getFastMathFlags();
+          IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty},
+                                      FMF);
+          return getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput) <=
+                 TTIImpl->getArithmeticInstrCost(Instruction::FAdd, Ty,
+                                                 CostKind, Op1Info, Op2Info,
+                                                 Args, CxtIs);
+        }
+        return false;
+      }))
+    return TCC_Free;
+
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6467,6 +6467,13 @@
           TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1Info,
                                       Op2Info, Operands, OpsForVector);
+      if (VecCost == TargetTransformInfo::TCC_Free) {
+        // The vector instruction is considered free, so it can likely be
+        // folded into its users. This means there is no need to account for
+        // reordering costs for the folded instruction.
+        CommonCost = 0;
+      }
+
       LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
       return CommonCost + VecCost - ScalarCost;
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -7,21 +7,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP11]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -47,21 +46,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub reassoc <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd reassoc <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP11]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -5,18 +5,20 @@
 ; adds may look profitable, but is not because it eliminates generation of
 ; floating-point FMAs that would be more profitable.
 
-; FIXME: We generate a horizontal reduction today.
-
 define void @hr() {
 ; CHECK-LABEL: @hr(
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD3]] = fadd fast double [[MUL3]], [[ADD2]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
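
For context, here is a minimal scalar sketch of the pattern the new TTI check treats as free. It is not part of the patch and the function name is hypothetical; it only illustrates the shape the cost-model code above looks for: a reassoc fmul whose single user is a reassoc fadd/fsub that takes the fmul as its first operand, which targets such as AArch64 can fold into a single fused multiply-add.

; Illustrative sketch only (hypothetical function, not taken from the patch).
; The fmul has exactly one user, a reassoc fadd that uses it as operand 0, so
; the updated getArithmeticInstrCost can return TCC_Free for the fmul when the
; fmuladd cost is no greater than the cost of the fadd alone.
define float @fmul_fadd_fuse_candidate(float %a, float %b, float %c) {
  %mul = fmul reassoc float %a, %b
  %acc = fadd reassoc float %mul, %c
  ret float %acc
}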