diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -3365,7 +3365,7 @@
 // the add operand respectively. This allows fmuladd to represent a*b-c, or
 // c-a*b. Patterns in LLVM should catch the negated forms and translate them to
 // efficient operations.
-static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend,
+static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend,
                            const CodeGenFunction &CGF, CGBuilderTy &Builder,
                            bool negMul, bool negAdd) {
   assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set.");
@@ -3377,12 +3377,23 @@
   if (negAdd)
     Addend = Builder.CreateFNeg(Addend, "neg");
 
-  Value *FMulAdd = Builder.CreateCall(
-      CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
-      {MulOp0, MulOp1, Addend});
-  MulOp->eraseFromParent();
+  Value *FMulAdd = nullptr;
+  if (Builder.getIsFPConstrained()) {
+    assert(isa<llvm::ConstrainedFPIntrinsic>(MulOp) &&
+           "Only constrained operation should be created when Builder is in FP "
+           "constrained mode");
+    FMulAdd = Builder.CreateConstrainedFPCall(
+        CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd,
+                             Addend->getType()),
+        {MulOp0, MulOp1, Addend});
+  } else {
+    FMulAdd = Builder.CreateCall(
+        CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
+        {MulOp0, MulOp1, Addend});
+  }
+  MulOp->eraseFromParent();
 
-  return FMulAdd;
+  return FMulAdd;
 }
 
 // Check whether it would be legal to emit an fmuladd intrinsic call to
@@ -3417,6 +3428,19 @@
       return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
   }
 
+  if (auto *LHSBinOp = dyn_cast<llvm::CallBase>(op.LHS)) {
+    if (LHSBinOp->getIntrinsicID() ==
+            llvm::Intrinsic::experimental_constrained_fmul &&
+        LHSBinOp->use_empty())
+      return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub);
+  }
+  if (auto *RHSBinOp = dyn_cast<llvm::CallBase>(op.RHS)) {
+    if (RHSBinOp->getIntrinsicID() ==
+            llvm::Intrinsic::experimental_constrained_fmul &&
+        RHSBinOp->use_empty())
+      return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
+  }
+
   return nullptr;
 }
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -148,3 +148,15 @@
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
 };
+#pragma STDC FP_CONTRACT ON
+void bar(float f) {
+  f * f + f;
+  (double)f * f - f;
+  (long double)-f * f + f;
+
+// CHECK: call float @llvm.experimental.constrained.fmuladd.f32
+// CHECK: fneg
+// CHECK: call double @llvm.experimental.constrained.fmuladd.f64
+// CHECK: fneg
+// CHECK: call x86_fp80 @llvm.experimental.constrained.fmuladd.f80
+};
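For reference, the IR expected from the new Clang path for the first statement of bar looks like the sketch below. This is hand-written for illustration, not compiler output; the exact rounding metadata depends on the FP options the test is run with (dynamic rounding and strict exceptions are assumed here), and only the intrinsic call matters for the CHECK lines above.

  ; Assumed IR for "f * f + f" with FP_CONTRACT ON in constrained FP mode.
  define void @bar(float %f) #0 {
  entry:
    %0 = call float @llvm.experimental.constrained.fmuladd.f32(
                         float %f, float %f, float %f,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict") #0
    ret void
  }

  declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)

  attributes #0 = { strictfp }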
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16141,6 +16141,69 @@
 performed by '``llvm.experimental.constrained.fcmps``' will raise an
 exception if either operand is a NAN (QNAN or SNAN).
 
+'``llvm.experimental.constrained.fmuladd``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>,
+                                             <type> <op3>,
+                                             metadata <rounding mode>,
+                                             metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fmuladd``' intrinsic represents
+multiply-add expressions that can be fused if the code generator determines
+that (a) the target instruction set has support for a fused operation,
+and (b) that the fused operation is more efficient than the equivalent,
+separate pair of mul and add instructions.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fmuladd``'
+intrinsic must be floating-point or vector of floating-point values.
+All three arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+The expression:
+
+::
+
+      %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c,
+                                                                 metadata <rounding mode>,
+                                                                 metadata <exception behavior>)
+
+is equivalent to the expression:
+
+::
+
+      %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b,
+                                                              metadata <rounding mode>,
+                                                              metadata <exception behavior>)
+      %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c,
+                                                              metadata <rounding mode>,
+                                                              metadata <exception behavior>)
+
+except that it is unspecified whether rounding will be performed between the
+multiplication and addition steps. Fusion is not guaranteed, even if the target
+platform supports it.
+If a fused multiply-add is required, the corresponding
+:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should
+be used instead.
+This never sets errno, just as '``llvm.experimental.constrained.fma.*``'.
+
 Constrained libm-equivalent Intrinsics
 --------------------------------------
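The distinction the documentation draws can be made concrete in IR. In this sketch (function and value names are invented for illustration), the first call may be lowered fused or as a separate multiply and add, while the second must stay fused:

  define double @maybe_vs_must_fuse(double %a, double %b, double %c) #0 {
  entry:
    ; May be lowered as one fused operation or as fmul + fadd.
    %maybe = call double @llvm.experimental.constrained.fmuladd.f64(
                             double %a, double %b, double %c,
                             metadata !"round.dynamic",
                             metadata !"fpexcept.strict") #0
    ; Must be lowered as a single fused multiply-add.
    %must = call double @llvm.experimental.constrained.fma.f64(
                            double %a, double %b, double %c,
                            metadata !"round.dynamic",
                            metadata !"fpexcept.strict") #0
    %sum = call double @llvm.experimental.constrained.fadd.f64(
                           double %maybe, double %must,
                           metadata !"round.dynamic",
                           metadata !"fpexcept.strict") #0
    ret double %sum
  }

  declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
  declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
  declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)

  attributes #0 = { strictfp }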
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1288,6 +1288,9 @@
     case Intrinsic::fmuladd:
       ISDs.push_back(ISD::FMA);
       break;
+    case Intrinsic::experimental_constrained_fmuladd:
+      ISDs.push_back(ISD::STRICT_FMA);
+      break;
     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
     case Intrinsic::lifetime_start:
     case Intrinsic::lifetime_end:
@@ -1511,6 +1514,12 @@
     if (IID == Intrinsic::fmuladd)
       return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
              ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+    if (IID == Intrinsic::experimental_constrained_fmuladd)
+      return ConcreteTTI->getIntrinsicCost(
+                 Intrinsic::experimental_constrained_fmul, RetTy, Tys,
+                 nullptr) +
+             ConcreteTTI->getIntrinsicCost(
+                 Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr);
 
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -95,6 +95,10 @@
 DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT)
 DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC)
 
+// This is the definition for the fmuladd intrinsic, which is converted into
+// a constrained FMA or into constrained FMUL + FADD intrinsics.
+FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd)
+
 #undef INSTRUCTION
 #undef FUNCTION
 #undef CMP_INSTRUCTION
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -640,6 +640,13 @@
                                                     llvm_metadata_ty,
                                                     llvm_metadata_ty ]>;
+  def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ],
+                                                       [ LLVMMatchType<0>,
+                                                         LLVMMatchType<0>,
+                                                         LLVMMatchType<0>,
+                                                         llvm_metadata_ty,
+                                                         llvm_metadata_ty ]>;
+
   def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ],
                                                       [ llvm_anyfloat_ty,
                                                         llvm_metadata_ty ]>;
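Because the TableGen definition uses llvm_anyfloat_ty with three LLVMMatchType<0> operands, the intrinsic is overloaded over any scalar or vector floating-point type. A few instance declarations inferred from that definition (the f32 and f80 forms also appear in the Clang test above; the vector form is an assumed example):

  declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
  declare x86_fp80 @llvm.experimental.constrained.fmuladd.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata)
  declare <4 x float> @llvm.experimental.constrained.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)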
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7021,6 +7021,35 @@
     Opers.push_back(getValue(FPI.getArgOperand(1)));
   }
 
+  auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
+    assert(Result.getNode()->getNumValues() == 2);
+
+    // Push node to the appropriate list so that future instructions can be
+    // chained up correctly.
+    SDValue OutChain = Result.getValue(1);
+    switch (EB) {
+    case fp::ExceptionBehavior::ebIgnore:
+      // The only reason why ebIgnore nodes still need to be chained is that
+      // they might depend on the current rounding mode, and therefore must
+      // not be moved across instructions that may change that mode.
+      LLVM_FALLTHROUGH;
+    case fp::ExceptionBehavior::ebMayTrap:
+      // These must not be moved across calls or instructions that may change
+      // floating-point exception masks.
+      PendingConstrainedFP.push_back(OutChain);
+      break;
+    case fp::ExceptionBehavior::ebStrict:
+      // These must not be moved across calls or instructions that may change
+      // floating-point exception masks or read floating-point exception flags.
+      // In addition, they cannot be optimized out even if unused.
+      PendingConstrainedFPStrict.push_back(OutChain);
+      break;
+    }
+  };
+
+  SDVTList VTs = DAG.getVTList(ValueVTs);
+  fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
+
   unsigned Opcode;
   switch (FPI.getIntrinsicID()) {
   default:
     llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -7029,6 +7058,23 @@
     Opcode = ISD::STRICT_##DAGN;                                               \
     break;
 #include "llvm/IR/ConstrainedOps.def"
+  case Intrinsic::experimental_constrained_fmuladd: {
+    Opcode = ISD::STRICT_FMA;
+    // Break fmuladd into fmul and fadd.
+    if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
+        !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(),
+                                        ValueVTs[0])) {
+      Opers.pop_back();
+      SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers);
+      pushOutChain(Mul, EB);
+      Opcode = ISD::STRICT_FADD;
+      Opers.clear();
+      Opers.push_back(Mul.getValue(1));
+      Opers.push_back(Mul.getValue(0));
+      Opers.push_back(getValue(FPI.getArgOperand(2)));
+    }
+    break;
+  }
   }
 
   // A few strict DAG nodes carry additional operands that are not
@@ -7047,32 +7093,8 @@
     }
   }
 
-  SDVTList VTs = DAG.getVTList(ValueVTs);
   SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
-
-  assert(Result.getNode()->getNumValues() == 2);
-
-  // Push node to the appropriate list so that future instructions can be
-  // chained up correctly.
-  SDValue OutChain = Result.getValue(1);
-  switch (FPI.getExceptionBehavior().getValue()) {
-  case fp::ExceptionBehavior::ebIgnore:
-    // The only reason why ebIgnore nodes still need to be chained is that
-    // they might depend on the current rounding mode, and therefore must
-    // not be moved across instruction that may change that mode.
-    LLVM_FALLTHROUGH;
-  case fp::ExceptionBehavior::ebMayTrap:
-    // These must not be moved across calls or instructions that may change
-    // floating-point exception masks.
-    PendingConstrainedFP.push_back(OutChain);
-    break;
-  case fp::ExceptionBehavior::ebStrict:
-    // These must not be moved across calls or instructions that may change
-    // floating-point exception masks or read floating-point exception flags.
-    // In addition, they cannot be optimized out even if unused.
-    PendingConstrainedFPStrict.push_back(OutChain);
-    break;
-  }
+  pushOutChain(Result, EB);
 
   SDValue FPResult = Result.getValue(0);
   setValue(&FPI, FPResult);
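At the IR level, the new lowering path can be exercised with a sketch like the following (hypothetical function name). SelectionDAG turns the call into a single STRICT_FMA node when the target reports FMA as profitable and fusion is allowed; otherwise it emits STRICT_FMUL followed by STRICT_FADD, threading the multiply's output chain into the add so that floating-point exception ordering is preserved:

  define double @fmuladd_strict(double %a, double %b, double %c) #0 {
  entry:
    ; Becomes STRICT_FMA, or STRICT_FMUL + STRICT_FADD with the multiply's
    ; out-chain threaded into the add (see Opers.push_back(Mul.getValue(1))).
    %r = call double @llvm.experimental.constrained.fmuladd.f64(
                          double %a, double %b, double %c,
                          metadata !"round.dynamic",
                          metadata !"fpexcept.strict") #0
    ret double %r
  }

  declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)

  attributes #0 = { strictfp }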
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
--- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -322,6 +322,128 @@
   ret double %result
 }
 
+; Verify constrained fmul and fadd aren't fused.
+define float @f11(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f11:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    mulss %xmm1, %xmm0
+; NOFMA-NEXT:    addss %xmm2, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f11:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; FMA-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: f11:
+; FMA4:       # %bb.0: # %entry
+; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; FMA4-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+entry:
+  %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
+                                                          metadata !"round.dynamic",
+                                                          metadata !"fpexcept.strict") #0
+  %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
+                                                          metadata !"round.dynamic",
+                                                          metadata !"fpexcept.strict") #0
+  ret float %4
+}
+
+; Verify constrained fmul and fadd aren't fused.
+define double @f12(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f12:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    mulsd %xmm1, %xmm0
+; NOFMA-NEXT:    addsd %xmm2, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f12:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; FMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: f12:
+; FMA4:       # %bb.0: # %entry
+; FMA4-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; FMA4-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; FMA4-NEXT:    retq
entry:
+  %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
+                                                           metadata !"round.dynamic",
+                                                           metadata !"fpexcept.strict") #0
+  %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
+                                                           metadata !"round.dynamic",
+                                                           metadata !"fpexcept.strict") #0
+  ret double %4
+}
+
+; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
+; unknown.
+define float @f15() #0 {
+; NOFMA-LABEL: f15:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NOFMA-NEXT:    movaps %xmm1, %xmm0
+; NOFMA-NEXT:    mulss %xmm1, %xmm0
+; NOFMA-NEXT:    addss %xmm1, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f15:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: f15:
+; FMA4:       # %bb.0: # %entry
+; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA4-NEXT:    vfmaddss %xmm0, %xmm0, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+entry:
+  %result = call float @llvm.experimental.constrained.fmuladd.f32(
+                           float 3.5,
+                           float 3.5,
+                           float 3.5,
+                           metadata !"round.dynamic",
+                           metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
+; unknown.
+define double @f16() #0 {
+; NOFMA-LABEL: f16:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NOFMA-NEXT:    movapd %xmm1, %xmm0
+; NOFMA-NEXT:    mulsd %xmm1, %xmm0
+; NOFMA-NEXT:    addsd %xmm1, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f16:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: f16:
+; FMA4:       # %bb.0: # %entry
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA4-NEXT:    vfmaddsd %xmm0, %xmm0, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+entry:
+  %result = call double @llvm.experimental.constrained.fmuladd.f64(
+                            double 42.1,
+                            double 42.1,
+                            double 42.1,
+                            metadata !"round.dynamic",
+                            metadata !"fpexcept.strict") #0
+  ret double %result
+}
+
 ; Verify that fma(3.5) isn't simplified when the rounding mode is
 ; unknown.
 define float @f17() #0 {
@@ -954,7 +1076,13 @@
 
 attributes #0 = { strictfp }
 
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
 declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
+declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)