diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -3361,7 +3361,7 @@ // the add operand respectively. This allows fmuladd to represent a*b-c, or // c-a*b. Patterns in LLVM should catch the negated forms and translate them to // efficient operations. -static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend, +static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend, const CodeGenFunction &CGF, CGBuilderTy &Builder, bool negMul, bool negAdd) { assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set."); @@ -3373,12 +3373,25 @@ if (negAdd) Addend = Builder.CreateFNeg(Addend, "neg"); - Value *FMulAdd = Builder.CreateCall( - CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()), - {MulOp0, MulOp1, Addend}); - MulOp->eraseFromParent(); + Value *FMulAdd = nullptr; + if (Builder.getIsFPConstrained()) { + // Only constrained operation should be created when Builder is in FP + // constrained mode. 
+ assert(isa<llvm::ConstrainedFPIntrinsic>(MulOp) && + "Only constrained operation should be created when Builder is in FP " + "constrained mode"); + FMulAdd = Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd, + Addend->getType()), + {MulOp0, MulOp1, Addend, MulOp->getOperand(2), MulOp->getOperand(3)}); + } else { + FMulAdd = Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()), + {MulOp0, MulOp1, Addend}); + } + MulOp->eraseFromParent(); - return FMulAdd; + return FMulAdd; } // Check whether it would be legal to emit an fmuladd intrinsic call to @@ -3413,6 +3426,19 @@ return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false); } + if (auto *LHSBinOp = dyn_cast<llvm::CallBase>(op.LHS)) { + if (LHSBinOp->getIntrinsicID() == + llvm::Intrinsic::experimental_constrained_fmul && + LHSBinOp->use_empty()) + return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub); + } + if (auto *RHSBinOp = dyn_cast<llvm::CallBase>(op.RHS)) { + if (RHSBinOp->getIntrinsicID() == + llvm::Intrinsic::experimental_constrained_fmul && + RHSBinOp->use_empty()) + return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false); + } + return nullptr; } diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c --- a/clang/test/CodeGen/constrained-math-builtins.c +++ b/clang/test/CodeGen/constrained-math-builtins.c @@ -148,3 +148,15 @@ // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) }; +#pragma STDC FP_CONTRACT ON +void bar(float f) { + f * f + f; + (double)f * f - f; + (long double)-f * f + f; + +// CHECK: call float @llvm.experimental.constrained.fmuladd.f32 +// CHECK: fneg +// CHECK: call double @llvm.experimental.constrained.fmuladd.f64 +// CHECK: fneg +// CHECK: call x86_fp80 @llvm.experimental.constrained.fmuladd.f80 +}; diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16061,6 +16061,69 @@ performed by 
'``llvm.experimental.constrained.fcmps``' will raise an exception if either operand is a NAN (QNAN or SNAN). +'``llvm.experimental.constrained.fmuladd``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare <type> + @llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>, + <type> <op3>, + metadata <rounding mode>, + metadata <exception behavior>) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.fmuladd``' intrinsic represents +multiply-add expressions that can be fused if the code generator determines +that (a) the target instruction set has support for a fused operation, +and (b) that the fused operation is more efficient than the equivalent, +separate pair of mul and add instructions. + +Arguments: +"""""""""" + +The first three arguments to the '``llvm.experimental.constrained.fmuladd``' +intrinsic must be floating-point or vector of floating-point values. +All three arguments must have identical types. + +The fourth and fifth arguments specify the rounding mode and exception behavior +as described above. + +Semantics: +"""""""""" + +The expression: + +:: + + %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c, + metadata <rounding mode>, + metadata <exception behavior>) + +is equivalent to the expression: + +:: + + %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b, + metadata <rounding mode>, + metadata <exception behavior>) + %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c, + metadata <rounding mode>, + metadata <exception behavior>) + +except that it is unspecified whether rounding will be performed between the +multiplication and addition steps. Fusion is not guaranteed, even if the target +platform supports it. +If a fused multiply-add is required, the corresponding +:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should be +used instead. +This never sets errno, just as '``llvm.experimental.constrained.fma.*``'. 
+ Constrained libm-equivalent Intrinsics -------------------------------------- diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1286,6 +1286,9 @@ case Intrinsic::fmuladd: ISDs.push_back(ISD::FMA); break; + case Intrinsic::experimental_constrained_fmuladd: + ISDs.push_back(ISD::STRICT_FMA); + break; // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: @@ -1509,6 +1512,12 @@ if (IID == Intrinsic::fmuladd) return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) + ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy); + if (IID == Intrinsic::experimental_constrained_fmuladd) + return ConcreteTTI->getIntrinsicCost( + Intrinsic::experimental_constrained_fmul, RetTy, Tys, + nullptr) + + ConcreteTTI->getIntrinsicCost( + Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr); // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -95,6 +95,10 @@ DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT) DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC) +// This is definition for fmuladd intrinsic function, that is converted into +// constrained FMA or FMUL + FADD intrinsics. 
+FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd) + #undef INSTRUCTION #undef FUNCTION #undef CMP_INSTRUCTION diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -626,6 +626,13 @@ llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty ]>; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6999,6 +6999,35 @@ Opers.push_back(getValue(FPI.getArgOperand(1))); } + auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) { + assert(Result.getNode()->getNumValues() == 2); + + // Push node to the appropriate list so that future instructions can be + // chained up correctly. + SDValue OutChain = Result.getValue(1); + switch (EB) { + case fp::ExceptionBehavior::ebIgnore: + // The only reason why ebIgnore nodes still need to be chained is that + // they might depend on the current rounding mode, and therefore must + // not be moved across instruction that may change that mode. + LLVM_FALLTHROUGH; + case fp::ExceptionBehavior::ebMayTrap: + // These must not be moved across calls or instructions that may change + // floating-point exception masks. + PendingConstrainedFP.push_back(OutChain); + break; + case fp::ExceptionBehavior::ebStrict: + // These must not be moved across calls or instructions that may change + // floating-point exception masks or read floating-point exception flags. + // In addition, they cannot be optimized out even if unused. 
+ PendingConstrainedFPStrict.push_back(OutChain); + break; + } + }; + + SDVTList VTs = DAG.getVTList(ValueVTs); + fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue(); + unsigned Opcode; switch (FPI.getIntrinsicID()) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -7007,6 +7036,23 @@ Opcode = ISD::STRICT_##DAGN; \ break; #include "llvm/IR/ConstrainedOps.def" + case Intrinsic::experimental_constrained_fmuladd: { + Opcode = ISD::STRICT_FMA; + // Break fmuladd into fmul and fadd. + if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict || + !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), + ValueVTs[0])) { + Opers.pop_back(); + SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers); + pushOutChain(Mul, EB); + Opcode = ISD::STRICT_FADD; + Opers.clear(); + Opers.push_back(Mul.getValue(1)); + Opers.push_back(Mul.getValue(0)); + Opers.push_back(getValue(FPI.getArgOperand(2))); + } + break; + } } // A few strict DAG nodes carry additional operands that are not @@ -7025,32 +7071,8 @@ } } - SDVTList VTs = DAG.getVTList(ValueVTs); SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers); - - assert(Result.getNode()->getNumValues() == 2); - - // Push node to the appropriate list so that future instructions can be - // chained up correctly. - SDValue OutChain = Result.getValue(1); - switch (FPI.getExceptionBehavior().getValue()) { - case fp::ExceptionBehavior::ebIgnore: - // The only reason why ebIgnore nodes still need to be chained is that - // they might depend on the current rounding mode, and therefore must - // not be moved across instruction that may change that mode. - LLVM_FALLTHROUGH; - case fp::ExceptionBehavior::ebMayTrap: - // These must not be moved across calls or instructions that may change - // floating-point exception masks. 
- PendingConstrainedFP.push_back(OutChain); - break; - case fp::ExceptionBehavior::ebStrict: - // These must not be moved across calls or instructions that may change - // floating-point exception masks or read floating-point exception flags. - // In addition, they cannot be optimized out even if unused. - PendingConstrainedFPStrict.push_back(OutChain); - break; - } + pushOutChain(Result, EB); SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -3,6 +3,104 @@ ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA +; Verify constrained fmul and fadd aren't fused. +define float @f11(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f11: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f11: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; FMA-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %4 +} + +; Verify constrained fmul and fadd aren't fused. 
+define double @f12(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f12: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f12: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; FMA-NEXT: vaddsd %xmm2, %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %4 +} + +; Verify that fmuladd(3.5) isn't simplified when the rounding mode is +; unknown. +define float @f15() #0 { +; NOFMA-LABEL: f15: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NOFMA-NEXT: movaps %xmm1, %xmm0 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm1, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f15: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 +; FMA-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.fmuladd.f32( + float 3.5, + float 3.5, + float 3.5, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that fmuladd(42.1) isn't simplified when the rounding mode is +; unknown. 
+define double @f16() #0 { +; NOFMA-LABEL: f16: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NOFMA-NEXT: movapd %xmm1, %xmm0 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm1, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f16: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 +; FMA-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.fmuladd.f64( + double 42.1, + double 42.1, + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that fma(3.5) isn't simplified when the rounding mode is ; unknown. define float @f17() #0 { @@ -65,5 +163,11 @@ attributes #0 = { strictfp } +declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) +declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata) diff --git a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td b/llvm/test/TableGen/GlobalISelEmitter-input-discard.td --- a/llvm/test/TableGen/GlobalISelEmitter-input-discard.td +++ b/llvm/test/TableGen/GlobalISelEmitter-input-discard.td @@ -15,7 +15,7 @@ // GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // GISEL-NEXT: GIM_CheckType, /*MI*/0, /*Op*/3, /*Type*/GILLT_s32, // GISEL-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, 
/*RC*/MyTarget::GPR32RegClassID, -// GISEL-NEXT: // (intrinsic_w_chain:{ *:[i32] } 248:{ *:[iPTR] }, srcvalue:{ *:[i32] }, i32:{ *:[i32] }:$src1) => (FOO:{ *:[i32] } (IMPLICIT_DEF:{ *:[i32] }), GPR32:{ *:[i32] }:$src1) +// GISEL-NEXT: // (intrinsic_w_chain:{ *:[i32] } 249:{ *:[iPTR] }, srcvalue:{ *:[i32] }, i32:{ *:[i32] }:$src1) => (FOO:{ *:[i32] } (IMPLICIT_DEF:{ *:[i32] }), GPR32:{ *:[i32] }:$src1) // GISEL-NEXT: GIR_MakeTempReg, /*TempRegID*/0, /*TypeID*/GILLT_s32, // GISEL-NEXT: GIR_BuildMI, /*InsnID*/1, /*Opcode*/TargetOpcode::IMPLICIT_DEF, // GISEL-NEXT: GIR_AddTempRegister, /*InsnID*/1, /*TempRegID*/0, /*TempRegFlags*/RegState::Define,