diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -3361,7 +3361,7 @@
 // the add operand respectively. This allows fmuladd to represent a*b-c, or
 // c-a*b. Patterns in LLVM should catch the negated forms and translate them to
 // efficient operations.
-static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend,
+static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend,
                            const CodeGenFunction &CGF, CGBuilderTy &Builder,
                            bool negMul, bool negAdd) {
   assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set.");
@@ -3373,12 +3373,19 @@
   if (negAdd)
     Addend = Builder.CreateFNeg(Addend, "neg");
 
-  Value *FMulAdd = Builder.CreateCall(
-      CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
-      {MulOp0, MulOp1, Addend});
-   MulOp->eraseFromParent();
+  Value *FMulAdd = nullptr;
+  if (Builder.getIsFPConstrained())
+    FMulAdd = Builder.CreateCall(
+        CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd,
+                             Addend->getType()),
+        {MulOp0, MulOp1, Addend, MulOp->getOperand(2), MulOp->getOperand(3)});
+  else
+    FMulAdd = Builder.CreateCall(
+        CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()),
+        {MulOp0, MulOp1, Addend});
+  MulOp->eraseFromParent();
 
-   return FMulAdd;
+  return FMulAdd;
 }
 
 // Check whether it would be legal to emit an fmuladd intrinsic call to
@@ -3413,6 +3420,21 @@
       return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
   }
 
+  if (Builder.getIsFPConstrained()) {
+    if (auto *LHSBinOp = dyn_cast<llvm::CallBase>(op.LHS)) {
+      if (LHSBinOp->getIntrinsicID() ==
+              llvm::Intrinsic::experimental_constrained_fmul &&
+          LHSBinOp->use_empty())
+        return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub);
+    }
+    if (auto *RHSBinOp = dyn_cast<llvm::CallBase>(op.RHS)) {
+      if (RHSBinOp->getIntrinsicID() ==
+              llvm::Intrinsic::experimental_constrained_fmul &&
+          RHSBinOp->use_empty())
+        return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false);
+    }
+  }
+
   return nullptr;
 }
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -148,3 +148,13 @@
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
 };
+
+#pragma STDC FP_CONTRACT ON
+void bar(float f) {
+  f * f + f;
+  (double)f * f + f;
+  (long double)f * f + f;
+
+// CHECK: declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+// CHECK: declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
+// CHECK: declare x86_fp80 @llvm.experimental.constrained.fmuladd.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata)
+};
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16061,6 +16061,63 @@
 performed by '``llvm.experimental.constrained.fcmps``' will raise an
 exception if either operand is a NAN (QNAN or SNAN).
 
+'``llvm.experimental.constrained.fmuladd``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>,
+                                             <type> <op3>,
+                                             metadata <rounding mode>,
+                                             metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.fmuladd``' intrinsic represents
+multiply-add expressions that can be fused if the code generator determines
+that (a) the target instruction set has support for a fused operation,
+and (b) that the fused operation is more efficient than the equivalent,
+separate pair of mul and add instructions.
+
+Arguments:
+""""""""""
+
+The first three arguments to the '``llvm.experimental.constrained.fmuladd``'
+intrinsic must be floating-point or vector of floating-point values.
+All three arguments must have identical types.
+
+The fourth and fifth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+The expression:
+
+::
+
+      %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c)
+
+is equivalent to the expression:
+
+::
+
+      %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b)
+      %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c)
+
+except that it is unspecified whether rounding will be performed between the
+multiplication and addition steps. Fusion is not guaranteed, even if the target
+platform supports it.
+If a fused multiply-add is required, the corresponding
+:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should be
+used instead.
+This never sets errno, just like '``llvm.experimental.constrained.fma.*``'.
+
 Constrained libm-equivalent Intrinsics
 --------------------------------------
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1286,6 +1286,9 @@
     case Intrinsic::fmuladd:
       ISDs.push_back(ISD::FMA);
       break;
+    case Intrinsic::experimental_constrained_fmuladd:
+      ISDs.push_back(ISD::STRICT_FMA);
+      break;
     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
     case Intrinsic::lifetime_start:
     case Intrinsic::lifetime_end:
@@ -1509,6 +1512,13 @@
     if (IID == Intrinsic::fmuladd)
       return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
              ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
+    // FIXME: Is a constrained intrinsic's cost equal to the non-strict one's?
+    if (IID == Intrinsic::experimental_constrained_fmuladd)
+      return ConcreteTTI->getIntrinsicCost(
+                 Intrinsic::experimental_constrained_fmul, RetTy, Tys,
+                 nullptr) +
+             ConcreteTTI->getIntrinsicCost(
+                 Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr);
 
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -352,6 +352,10 @@
     /// comparison operation.
     STRICT_FSETCC, STRICT_FSETCCS,
 
+    /// FMULADD/STRICT_FMULADD - Intermediate nodes that allow constrained
+    /// fmuladd to be handled the same way as the other constrained intrinsics.
+    FMULADD, STRICT_FMULADD,
+
     /// FMA - Perform a * b + c with no intermediate rounding step.
    FMA,
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -81,6 +81,10 @@
 FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT)
 FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC)
 
+// This is the definition for the fmuladd intrinsic, which is converted into
+// either a constrained FMA or constrained FMUL + FADD intrinsics.
+FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd, FMULADD)
+
 #undef INSTRUCTION
 #undef FUNCTION
 #undef CMP_INSTRUCTION
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -626,6 +626,13 @@
                                                          llvm_metadata_ty,
                                                          llvm_metadata_ty ]>;
 
+  def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ],
+                                                       [ LLVMMatchType<0>,
+                                                         LLVMMatchType<0>,
+                                                         LLVMMatchType<0>,
+                                                         llvm_metadata_ty,
+                                                         llvm_metadata_ty ]>;
+
   def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ],
                                                       [ llvm_anyfloat_ty,
                                                         llvm_metadata_ty ]>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7058,32 +7058,55 @@
   }
   }
 
+  auto pushOutChain = [this](auto &Result, auto EB) {
+    assert(Result.getNode()->getNumValues() == 2);
+
+    // Push node to the appropriate list so that future instructions can be
+    // chained up correctly.
+    SDValue OutChain = Result.getValue(1);
+    switch (EB) {
+    case fp::ExceptionBehavior::ebIgnore:
+      // The only reason why ebIgnore nodes still need to be chained is that
+      // they might depend on the current rounding mode, and therefore must
+      // not be moved across instruction that may change that mode.
+      LLVM_FALLTHROUGH;
+    case fp::ExceptionBehavior::ebMayTrap:
+      // These must not be moved across calls or instructions that may change
+      // floating-point exception masks.
+      PendingConstrainedFP.push_back(OutChain);
+      break;
+    case fp::ExceptionBehavior::ebStrict:
+      // These must not be moved across calls or instructions that may change
+      // floating-point exception masks or read floating-point exception flags.
+      // In addition, they cannot be optimized out even if unused.
+      PendingConstrainedFPStrict.push_back(OutChain);
+      break;
+    }
+  };
+
   SDVTList VTs = DAG.getVTList(ValueVTs);
-  SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
-
-  assert(Result.getNode()->getNumValues() == 2);
-
-  // Push node to the appropriate list so that future instructions can be
-  // chained up correctly.
-  SDValue OutChain = Result.getValue(1);
-  switch (FPI.getExceptionBehavior().getValue()) {
-  case fp::ExceptionBehavior::ebIgnore:
-    // The only reason why ebIgnore nodes still need to be chained is that
-    // they might depend on the current rounding mode, and therefore must
-    // not be moved across instruction that may change that mode.
-    LLVM_FALLTHROUGH;
-  case fp::ExceptionBehavior::ebMayTrap:
-    // These must not be moved across calls or instructions that may change
-    // floating-point exception masks.
-    PendingConstrainedFP.push_back(OutChain);
-    break;
-  case fp::ExceptionBehavior::ebStrict:
-    // These must not be moved across calls or instructions that may change
-    // floating-point exception masks or read floating-point exception flags.
-    // In addition, they cannot be optimized out even if unused.
-    PendingConstrainedFPStrict.push_back(OutChain);
-    break;
-  }
+  fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
+  SDValue Result;
+
+  if (Opcode == ISD::STRICT_FMULADD) {
+    Opcode = ISD::STRICT_FMA;
+    // Break fmuladd into fmul and fadd.
+    if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
+        !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(),
+                                        ValueVTs[0])) {
+      Opers.pop_back();
+      Result = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers);
+      pushOutChain(Result, EB);
+      Opcode = ISD::STRICT_FADD;
+      Opers.clear();
+      Opers.push_back(Result.getValue(1));
+      Opers.push_back(Result.getValue(0));
+      Opers.push_back(getValue(FPI.getArgOperand(2)));
+    }
+  }
+
+  Result = DAG.getNode(Opcode, sdl, VTs, Opers);
+  pushOutChain(Result, EB);
 
   SDValue FPResult = Result.getValue(0);
   setValue(&FPI, FPResult);
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
--- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -3,6 +3,104 @@
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA
 
+; Verify constrained fmul and fadd aren't fused.
+define float @f11(float %0, float %1, float %2) #0 {
+; NOFMA-LABEL: f11:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    mulss %xmm1, %xmm0
+; NOFMA-NEXT:    addss %xmm2, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f11:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; FMA-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; FMA-NEXT:    retq
+entry:
+  %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1,
+                                                          metadata !"round.dynamic",
+                                                          metadata !"fpexcept.strict") #0
+  %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2,
+                                                          metadata !"round.dynamic",
+                                                          metadata !"fpexcept.strict") #0
+  ret float %4
+}
+
+; Verify constrained fmul and fadd aren't fused.
+define double @f12(double %0, double %1, double %2) #0 {
+; NOFMA-LABEL: f12:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    mulsd %xmm1, %xmm0
+; NOFMA-NEXT:    addsd %xmm2, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f12:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; FMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
+; FMA-NEXT:    retq
+entry:
+  %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1,
+                                                           metadata !"round.dynamic",
+                                                           metadata !"fpexcept.strict") #0
+  %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2,
+                                                           metadata !"round.dynamic",
+                                                           metadata !"fpexcept.strict") #0
+  ret double %4
+}
+
+; Verify that fmuladd(3.5) isn't simplified when the rounding mode is
+; unknown.
+define float @f15() #0 {
+; NOFMA-LABEL: f15:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NOFMA-NEXT:    movaps %xmm1, %xmm0
+; NOFMA-NEXT:    mulss %xmm1, %xmm0
+; NOFMA-NEXT:    addss %xmm1, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f15:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT:    retq
+entry:
+  %result = call float @llvm.experimental.constrained.fmuladd.f32(
+                                               float 3.5,
+                                               float 3.5,
+                                               float 3.5,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
+; Verify that fmuladd(42.1) isn't simplified when the rounding mode is
+; unknown.
+define double @f16() #0 {
+; NOFMA-LABEL: f16:
+; NOFMA:       # %bb.0: # %entry
+; NOFMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; NOFMA-NEXT:    movapd %xmm1, %xmm0
+; NOFMA-NEXT:    mulsd %xmm1, %xmm0
+; NOFMA-NEXT:    addsd %xmm1, %xmm0
+; NOFMA-NEXT:    retq
+;
+; FMA-LABEL: f16:
+; FMA:       # %bb.0: # %entry
+; FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
+; FMA-NEXT:    retq
+entry:
+  %result = call double @llvm.experimental.constrained.fmuladd.f64(
+                                               double 42.1,
+                                               double 42.1,
+                                               double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict") #0
+  ret double %result
+}
+
 ; Verify that fma(3.5) isn't simplified when the rounding mode is
 ; unknown.
 define float @f17() #0 {
@@ -65,5 +163,11 @@
 
 attributes #0 = { strictfp }
 
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
 declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
+declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)
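Note (not part of the patch): the minimal IR sketch below shows the two shapes this change is about. @fused is roughly what clang now emits for `f * f + f` under `#pragma STDC FP_CONTRACT ON` in a strict-FP function, and @unfused is the unfused form the SelectionDAG builder falls back to when a fused operation is not legal or not faster on the target. The function names @fused and @unfused are placeholders; the intrinsic signatures match the declarations added in the tests above.

; Sketch only -- not part of the patch; @fused and @unfused are placeholder names.
define float @fused(float %a, float %b, float %c) strictfp {
entry:
  ; A single constrained fmuladd call; ISel sees this as STRICT_FMULADD and
  ; selects STRICT_FMA when fusion is allowed and profitable.
  %r = call float @llvm.experimental.constrained.fmuladd.f32(
                      float %a, float %b, float %c,
                      metadata !"round.dynamic",
                      metadata !"fpexcept.strict") strictfp
  ret float %r
}

define float @unfused(float %a, float %b, float %c) strictfp {
entry:
  ; The fallback expansion: a constrained fmul chained into a constrained fadd,
  ; where rounding may happen between the two steps.
  %mul = call float @llvm.experimental.constrained.fmul.f32(
                      float %a, float %b,
                      metadata !"round.dynamic",
                      metadata !"fpexcept.strict") strictfp
  %add = call float @llvm.experimental.constrained.fadd.f32(
                      float %mul, float %c,
                      metadata !"round.dynamic",
                      metadata !"fpexcept.strict") strictfp
  ret float %add
}

declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)

As the LangRef text above states, the two forms differ under strict semantics only in that the unfused form may round between the multiply and the add, while the fused form leaves that unspecified; code that requires fusion should use llvm.experimental.constrained.fma instead.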