Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1256,6 +1256,10 @@
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind) const;

+  /// \returns The cost savings of contracting a floating-point multiply and
+  /// add/subtract into a single FMA, or zero if contraction is not allowed.
+  InstructionCost getFMACostSavings(Type *Ty, FastMathFlags FMF) const;
+
   /// \returns The cost of Call instructions.
   InstructionCost getCallInstrCost(
       Function *F, Type *RetTy, ArrayRef<Type *> Tys,
Index: llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -36,6 +36,7 @@
 class InsertElementInst;
 class InsertValueInst;
 class Instruction;
+class InstructionCost;
 class LoopInfo;
 class OptimizationRemarkEmitter;
 class PHINode;
@@ -88,6 +89,9 @@
   /// every time we run into a memory barrier.
   void collectSeedInstructions(BasicBlock *BB);

+  /// Increase \p Cost if vectorizing the scalars in \p VL would lose
+  /// FMA contraction opportunities.
+  void adjustForFMAs(InstructionCost &Cost, ArrayRef<Value *> VL);
+
   /// Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -909,6 +909,33 @@
   return Cost;
 }

+InstructionCost
+TargetTransformInfo::getFMACostSavings(Type *Ty, FastMathFlags FMF) const {
+  // An FMA can only be formed when contraction is allowed.
+  if (!FMF.allowContract())
+    return 0;
+
+  // Savings = Cost(FMul) + Cost(FAdd) - Cost(FMA).
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost MulCost =
+      getArithmeticInstrCost(Instruction::FMul, Ty, CostKind);
+  InstructionCost AddCost =
+      getArithmeticInstrCost(Instruction::FAdd, Ty, CostKind);
+
+  // Model the contracted form as llvm.fmuladd, which takes three operands
+  // of type Ty.
+  SmallVector<Type *, 3> ArgTys = {Ty, Ty, Ty};
+  IntrinsicCostAttributes CostAttrs(Intrinsic::fmuladd, Ty, ArgTys);
+  InstructionCost FMACost = getIntrinsicInstrCost(CostAttrs, CostKind);
+
+  InstructionCost Savings = MulCost + AddCost - FMACost;
+  if (Savings < 0)
+    Savings = 0;
+  return Savings;
+}
+
 InstructionCost
 TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
                                       ArrayRef<Type *> Tys,
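Note: a minimal sketch of how a caller might query the new hook, assuming a
TargetTransformInfo reference and an LLVMContext are already in scope (the
helper name queryFMASavings is made up for this note and is not part of the
patch):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Operator.h" // FastMathFlags
  #include "llvm/IR/Type.h"

  using namespace llvm;

  // Illustrative only: reports the per-element savings the hook computes for
  // a scalar double. Without the contract flag the hook returns 0.
  static InstructionCost queryFMASavings(const TargetTransformInfo &TTI,
                                         LLVMContext &Ctx) {
    FastMathFlags FMF;
    FMF.setAllowContract(true);
    // Expected: Cost(fmul) + Cost(fadd) - Cost(llvm.fmuladd), clamped at 0.
    return TTI.getFMACostSavings(Type::getDoubleTy(Ctx), FMF);
  }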
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10076,6 +10076,38 @@
   return tryToVectorizeList(VL, R);
 }

+void SLPVectorizerPass::adjustForFMAs(InstructionCost &Cost,
+                                      ArrayRef<Value *> VL) {
+  FastMathFlags FMF;
+  FMF.set();
+
+  // Only increase the cost if every scalar in VL is a contractible
+  // floating-point multiply that feeds exactly one floating-point
+  // add/subtract; those pairs would otherwise become FMAs.
+  for (Value *U : VL) {
+    auto *FPMO = dyn_cast<FPMathOperator>(U);
+    if (!(FPMO && FPMO->getOpcode() == Instruction::FMul &&
+          FPMO->hasAllowContract() && FPMO->hasOneUse()))
+      return;
+    auto *I = dyn_cast<Instruction>(FPMO->user_back());
+    if (!(I &&
+          (I->getOpcode() == Instruction::FAdd ||
+           I->getOpcode() == Instruction::FSub) &&
+          I->hasAllowContract()))
+      return;
+    FMF &= FPMO->getFastMathFlags();
+  }
+
+  InstructionCost FMASavings = TTI->getFMACostSavings(VL[0]->getType(), FMF);
+  if (FMASavings > 0) {
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << FMASavings * VL.size()
+                      << " for vectorization that breaks " << VL.size()
+                      << " FMAs\n");
+    Cost += FMASavings * VL.size();
+  }
+}
+
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                            bool LimitForRegisterSize) {
   if (VL.size() < 2)
@@ -10173,6 +10205,7 @@
       R.computeMinimumValueSizes();
       InstructionCost Cost = R.getTreeCost();
+      adjustForFMAs(Cost, Ops);
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
@@ -11221,6 +11254,21 @@
       VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
                                                    CostKind);
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+
+      // A horizontal FAdd reduction over contractible multiplies replaces
+      // ReduxWidth scalar FMAs; charge the vector cost for that loss.
+      if (RdxKind == RecurKind::FAdd) {
+        auto *I = dyn_cast<Instruction>(FirstReducedVal);
+        if (I && I->getOpcode() == Instruction::FMul &&
+            I->hasAllowContract()) {
+          InstructionCost FMASavings = TTI->getFMACostSavings(ScalarTy, FMF);
+          if (FMASavings > 0) {
+            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << FMASavings * ReduxWidth
+                              << " for horizontal reduction that breaks "
+                              << ReduxWidth << " FMAs\n");
+            VectorCost += FMASavings * ReduxWidth;
+          }
+        }
+      }
       break;
     }
     case RecurKind::FMax:
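For context, the scalar shape adjustForFMAs tries to preserve can be built
with IRBuilder as below. This is an illustrative sketch only
(emitContractedMulAdd is a hypothetical helper, not part of the patch); it
shows the contract-flagged fmul/fadd pair that the backend can later fuse
into one FMA, and that the tests below expect to survive vectorization:

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Illustrative only: emits 'fmul contract' feeding a single 'fadd
  // contract' - exactly the pattern the cost adjustment refuses to break up.
  static Value *emitContractedMulAdd(IRBuilder<> &B, Value *A, Value *X,
                                     Value *Acc) {
    FastMathFlags FMF;
    FMF.setAllowContract(true);
    B.setFastMathFlags(FMF); // applies to the FP ops created below
    Value *Mul = B.CreateFMul(A, X); // single use ...
    return B.CreateFAdd(Mul, Acc);   // ... consumed by one fadd -> FMA
  }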
-
 define void @hr() {
 ; CHECK-LABEL: @hr(
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD3]] = fadd fast double [[MUL3]], [[ADD2]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -45,18 +47,18 @@
 ; may look profitable; but both are not because this eliminates generation
 ; of floating-point FMAs that would be more profitable.

-; FIXME: We generate a horizontal reduction today, and if that's disabled, we
-; still vectorize some of the multiplies.
-
 define double @hr_or_mul() {
 ; CHECK-LABEL: @hr_or_mul(
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast double [[TMP3]], [[CVT0]]
-; CHECK-NEXT:    ret double [[OP_RDX]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[CVT0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
+; CHECK-NEXT:    ret double [[ADD3]]
 ;
   %cvt0 = uitofp i16 3 to double
   %mul0 = fmul fast double 7.000000e+00, %cvt0