Index: clang/lib/CodeGen/CGExprScalar.cpp =================================================================== --- clang/lib/CodeGen/CGExprScalar.cpp +++ clang/lib/CodeGen/CGExprScalar.cpp @@ -3361,7 +3361,7 @@ // the add operand respectively. This allows fmuladd to represent a*b-c, or // c-a*b. Patterns in LLVM should catch the negated forms and translate them to // efficient operations. -static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend, +static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend, const CodeGenFunction &CGF, CGBuilderTy &Builder, bool negMul, bool negAdd) { assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set."); @@ -3373,12 +3373,19 @@ if (negAdd) Addend = Builder.CreateFNeg(Addend, "neg"); - Value *FMulAdd = Builder.CreateCall( - CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()), - {MulOp0, MulOp1, Addend}); - MulOp->eraseFromParent(); + Value *FMulAdd = nullptr; + if (Builder.getIsFPConstrained()) + FMulAdd = Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd, + Addend->getType()), + {MulOp0, MulOp1, Addend, MulOp->getOperand(2), MulOp->getOperand(3)}); + else + FMulAdd = Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()), + {MulOp0, MulOp1, Addend}); + MulOp->eraseFromParent(); - return FMulAdd; + return FMulAdd; } // Check whether it would be legal to emit an fmuladd intrinsic call to @@ -3413,6 +3420,21 @@ return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false); } + if (Builder.getIsFPConstrained()) { + if (auto *LHSBinOp = dyn_cast(op.LHS)) { + if (LHSBinOp->getIntrinsicID() == + llvm::Intrinsic::experimental_constrained_fmul && + LHSBinOp->use_empty()) + return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub); + } + if (auto *RHSBinOp = dyn_cast(op.RHS)) { + if (RHSBinOp->getIntrinsicID() == + llvm::Intrinsic::experimental_constrained_fmul && + RHSBinOp->use_empty()) + return 
buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false); + } + } + return nullptr; } Index: clang/test/CodeGen/constrained-math-builtins.c =================================================================== --- clang/test/CodeGen/constrained-math-builtins.c +++ clang/test/CodeGen/constrained-math-builtins.c @@ -148,3 +148,13 @@ // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) }; +#pragma STDC FP_CONTRACT ON +void bar(float f) { + f * f + f; + (double)f * f + f; + (long double)f * f + f; + +// CHECK: declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) +// CHECK: declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata) +// CHECK: declare x86_fp80 @llvm.experimental.constrained.fmuladd.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata) +}; Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -16061,6 +16061,63 @@ performed by '``llvm.experimental.constrained.fcmps``' will raise an exception if either operand is a NAN (QNAN or SNAN). +'``llvm.experimental.constrained.fmuladd``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare <type> + @llvm.experimental.constrained.fmuladd(<type> <op1>, <type> <op2>, + <type> <op3>, + metadata <rounding mode>, + metadata <exception behavior>) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.fmuladd``' intrinsic represents +multiply-add expressions that can be fused if the code generator determines +that (a) the target instruction set has support for a fused operation, +and (b) that the fused operation is more efficient than the equivalent, +separate pair of mul and add instructions. + +Arguments: +"""""""""" + +The first three arguments to the '``llvm.experimental.constrained.fmuladd``' +intrinsic must be floating-point or vector of floating-point values. +All three arguments must have identical types. 
+ +The fourth and fifth arguments specify the rounding mode and exception behavior +as described above. + +Semantics: +"""""""""" + +The expression: + +:: + + %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c, metadata <rounding mode>, metadata <exception behavior>) + +is equivalent to the expression: + +:: + + %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b, metadata <rounding mode>, metadata <exception behavior>) + %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c, metadata <rounding mode>, metadata <exception behavior>) + +except that it is unspecified whether rounding will be performed between the +multiplication and addition steps. Fusion is not guaranteed, even if the target +platform supports it. +If a fused multiply-add is required, the corresponding +:ref:`llvm.experimental.constrained.fma <int_fma>` intrinsic function should be +used instead. +Like '``llvm.experimental.constrained.fma.*``', this intrinsic never sets errno. + Constrained libm-equivalent Intrinsics -------------------------------------- Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1286,6 +1286,9 @@ case Intrinsic::fmuladd: ISDs.push_back(ISD::FMA); break; + case Intrinsic::experimental_constrained_fmuladd: + ISDs.push_back(ISD::STRICT_FMA); + break; // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: @@ -1509,6 +1512,13 @@ if (IID == Intrinsic::fmuladd) return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) + ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy); + // FIXME: Is a constrained intrinsic's cost the same as its non-strict one? + if (IID == Intrinsic::experimental_constrained_fmuladd) + return ConcreteTTI->getIntrinsicCost( + Intrinsic::experimental_constrained_fmul, RetTy, Tys, + nullptr) + + ConcreteTTI->getIntrinsicCost( + Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr); // Else, assume that we need to scalarize this intrinsic. 
For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it Index: llvm/include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -701,7 +701,7 @@ switch (NodeType) { default: return false; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" return true; Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -964,7 +964,7 @@ unsigned EqOpc; switch (Op) { default: llvm_unreachable("Unexpected FP pseudo-opcode"); -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: EqOpc = ISD::DAGN; break; #define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: EqOpc = ISD::SETCC; break; Index: llvm/include/llvm/IR/ConstrainedOps.def =================================================================== --- llvm/include/llvm/IR/ConstrainedOps.def +++ llvm/include/llvm/IR/ConstrainedOps.def @@ -11,18 +11,32 @@ // //===----------------------------------------------------------------------===// +// DAG_FUNCTION defers to DAG_INSTRUCTION if it's defined, otherwise FUNCTION. +#ifndef DAG_FUNCTION +#ifdef DAG_INSTRUCTION +#define DAG_FUNCTION(N,A,R,I,D) DAG_INSTRUCTION(N,A,R,I,D) +#else +#define DAG_FUNCTION(N,A,R,I,D) FUNCTION(N,A,R,I) +#endif +#endif + #ifndef INSTRUCTION -#define INSTRUCTION(N,A,R,I,D) +#define INSTRUCTION(N,A,R,I) +#endif + +// DAG_INSTRUCTION is treated like an INSTRUCTION if the DAG node isn't used. 
+#ifndef DAG_INSTRUCTION +#define DAG_INSTRUCTION(N,A,R,I,D) INSTRUCTION(N,A,R,I) #endif // In most cases intrinsic function is handled similar to instruction. #ifndef FUNCTION -#define FUNCTION INSTRUCTION +#define FUNCTION(N,A,R,I) INSTRUCTION(N,A,R,I) #endif -// Likewise for compare instructions. +// Compare instructions have a DAG node, so treat them like DAG_INSTRUCTION. #ifndef CMP_INSTRUCTION -#define CMP_INSTRUCTION INSTRUCTION +#define CMP_INSTRUCTION(N,A,R,I,D) DAG_INSTRUCTION(N,A,R,I,D) #endif // Arguments of the entries are: @@ -35,52 +49,58 @@ // These are definitions for instructions, that are converted into constrained // intrinsics. // -INSTRUCTION(FAdd, 2, 1, experimental_constrained_fadd, FADD) -INSTRUCTION(FSub, 2, 1, experimental_constrained_fsub, FSUB) -INSTRUCTION(FMul, 2, 1, experimental_constrained_fmul, FMUL) -INSTRUCTION(FDiv, 2, 1, experimental_constrained_fdiv, FDIV) -INSTRUCTION(FRem, 2, 1, experimental_constrained_frem, FREM) -INSTRUCTION(FPExt, 1, 0, experimental_constrained_fpext, FP_EXTEND) -INSTRUCTION(SIToFP, 1, 1, experimental_constrained_sitofp, SINT_TO_FP) -INSTRUCTION(UIToFP, 1, 1, experimental_constrained_uitofp, UINT_TO_FP) -INSTRUCTION(FPToSI, 1, 0, experimental_constrained_fptosi, FP_TO_SINT) -INSTRUCTION(FPToUI, 1, 0, experimental_constrained_fptoui, FP_TO_UINT) -INSTRUCTION(FPTrunc, 1, 1, experimental_constrained_fptrunc, FP_ROUND) +DAG_INSTRUCTION(FAdd, 2, 1, experimental_constrained_fadd, FADD) +DAG_INSTRUCTION(FSub, 2, 1, experimental_constrained_fsub, FSUB) +DAG_INSTRUCTION(FMul, 2, 1, experimental_constrained_fmul, FMUL) +DAG_INSTRUCTION(FDiv, 2, 1, experimental_constrained_fdiv, FDIV) +DAG_INSTRUCTION(FRem, 2, 1, experimental_constrained_frem, FREM) +DAG_INSTRUCTION(FPExt, 1, 0, experimental_constrained_fpext, FP_EXTEND) +DAG_INSTRUCTION(SIToFP, 1, 1, experimental_constrained_sitofp, SINT_TO_FP) +DAG_INSTRUCTION(UIToFP, 1, 1, experimental_constrained_uitofp, UINT_TO_FP) +DAG_INSTRUCTION(FPToSI, 1, 0, 
experimental_constrained_fptosi, FP_TO_SINT) +DAG_INSTRUCTION(FPToUI, 1, 0, experimental_constrained_fptoui, FP_TO_UINT) +DAG_INSTRUCTION(FPTrunc, 1, 1, experimental_constrained_fptrunc, FP_ROUND) // These are definitions for compare instructions (signaling and quiet version). // Both of these match to FCmp / SETCC. -CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmp, FSETCC) -CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmps, FSETCCS) +CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmp, FSETCC) +CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmps, FSETCCS) // Theses are definitions for intrinsic functions, that are converted into // constrained intrinsics. // -FUNCTION(ceil, 1, 0, experimental_constrained_ceil, FCEIL) -FUNCTION(cos, 1, 1, experimental_constrained_cos, FCOS) -FUNCTION(exp, 1, 1, experimental_constrained_exp, FEXP) -FUNCTION(exp2, 1, 1, experimental_constrained_exp2, FEXP2) -FUNCTION(floor, 1, 0, experimental_constrained_floor, FFLOOR) -FUNCTION(fma, 3, 1, experimental_constrained_fma, FMA) -FUNCTION(log, 1, 1, experimental_constrained_log, FLOG) -FUNCTION(log10, 1, 1, experimental_constrained_log10, FLOG10) -FUNCTION(log2, 1, 1, experimental_constrained_log2, FLOG2) -FUNCTION(lrint, 1, 1, experimental_constrained_lrint, LRINT) -FUNCTION(llrint, 1, 1, experimental_constrained_llrint, LLRINT) -FUNCTION(lround, 1, 0, experimental_constrained_lround, LROUND) -FUNCTION(llround, 1, 0, experimental_constrained_llround, LLROUND) -FUNCTION(maxnum, 2, 0, experimental_constrained_maxnum, FMAXNUM) -FUNCTION(minnum, 2, 0, experimental_constrained_minnum, FMINNUM) -FUNCTION(maximum, 2, 0, experimental_constrained_maximum, FMAXIMUM) -FUNCTION(minimum, 2, 0, experimental_constrained_minimum, FMINIMUM) -FUNCTION(nearbyint, 1, 1, experimental_constrained_nearbyint, FNEARBYINT) -FUNCTION(pow, 2, 1, experimental_constrained_pow, FPOW) -FUNCTION(powi, 2, 1, experimental_constrained_powi, FPOWI) -FUNCTION(rint, 1, 1, 
experimental_constrained_rint, FRINT) -FUNCTION(round, 1, 0, experimental_constrained_round, FROUND) -FUNCTION(sin, 1, 1, experimental_constrained_sin, FSIN) -FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT) -FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC) +DAG_FUNCTION(ceil, 1, 0, experimental_constrained_ceil, FCEIL) +DAG_FUNCTION(cos, 1, 1, experimental_constrained_cos, FCOS) +DAG_FUNCTION(exp, 1, 1, experimental_constrained_exp, FEXP) +DAG_FUNCTION(exp2, 1, 1, experimental_constrained_exp2, FEXP2) +DAG_FUNCTION(floor, 1, 0, experimental_constrained_floor, FFLOOR) +DAG_FUNCTION(fma, 3, 1, experimental_constrained_fma, FMA) +DAG_FUNCTION(log, 1, 1, experimental_constrained_log, FLOG) +DAG_FUNCTION(log10, 1, 1, experimental_constrained_log10, FLOG10) +DAG_FUNCTION(log2, 1, 1, experimental_constrained_log2, FLOG2) +DAG_FUNCTION(lrint, 1, 1, experimental_constrained_lrint, LRINT) +DAG_FUNCTION(llrint, 1, 1, experimental_constrained_llrint, LLRINT) +DAG_FUNCTION(lround, 1, 0, experimental_constrained_lround, LROUND) +DAG_FUNCTION(llround, 1, 0, experimental_constrained_llround, LLROUND) +DAG_FUNCTION(maxnum, 2, 0, experimental_constrained_maxnum, FMAXNUM) +DAG_FUNCTION(minnum, 2, 0, experimental_constrained_minnum, FMINNUM) +DAG_FUNCTION(maximum, 2, 0, experimental_constrained_maximum, FMAXIMUM) +DAG_FUNCTION(minimum, 2, 0, experimental_constrained_minimum, FMINIMUM) +DAG_FUNCTION(nearbyint, 1, 1, experimental_constrained_nearbyint, FNEARBYINT) +DAG_FUNCTION(pow, 2, 1, experimental_constrained_pow, FPOW) +DAG_FUNCTION(powi, 2, 1, experimental_constrained_powi, FPOWI) +DAG_FUNCTION(rint, 1, 1, experimental_constrained_rint, FRINT) +DAG_FUNCTION(round, 1, 0, experimental_constrained_round, FROUND) +DAG_FUNCTION(sin, 1, 1, experimental_constrained_sin, FSIN) +DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT) +DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC) + +// This is definition for fmuladd intrinsic function, 
that is converted into +// constrained FMA or FMUL + FADD intrinsics. +FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd) #undef INSTRUCTION #undef FUNCTION #undef CMP_INSTRUCTION +#undef DAG_INSTRUCTION +#undef DAG_FUNCTION Index: llvm/include/llvm/IR/IRBuilder.h =================================================================== --- llvm/include/llvm/IR/IRBuilder.h +++ llvm/include/llvm/IR/IRBuilder.h @@ -2213,9 +2213,9 @@ switch (ID) { default: break; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ - case Intrinsic::INTRINSIC: \ - HasRoundingMD = ROUND_MODE; \ +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Intrinsic::INTRINSIC: \ + HasRoundingMD = ROUND_MODE; \ break; #include "llvm/IR/ConstrainedOps.def" } @@ -2484,9 +2484,9 @@ switch (Callee->getIntrinsicID()) { default: break; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ - case Intrinsic::INTRINSIC: \ - HasRoundingMD = ROUND_MODE; \ +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Intrinsic::INTRINSIC: \ + HasRoundingMD = ROUND_MODE; \ break; #include "llvm/IR/ConstrainedOps.def" } Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -626,6 +626,13 @@ llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty ]>; Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -339,7 +339,7 @@ if (Action == TargetLowering::Legal) Action = 
TargetLowering::Expand; break; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" ValVT = Node->getValueType(0); @@ -970,7 +970,7 @@ case ISD::UDIVFIX: Results.push_back(ExpandFixedPointDiv(Node)); return; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" ExpandStrictFPOp(Node, Results); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -147,7 +147,7 @@ R = ScalarizeVecRes_TernaryOp(N); break; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" R = ScalarizeVecRes_StrictFPOp(N); @@ -942,7 +942,7 @@ SplitVecRes_TernaryOp(N, Lo, Hi); break; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" SplitVecRes_StrictFPOp(N, Lo, Hi); @@ -2753,7 +2753,7 @@ Res = WidenVecRes_BinaryWithExtraScalarOp(N); break; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" Res = WidenVecRes_StrictFP(N); Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7919,7 +7919,7 @@ switch (OrigOpc) { default: 
llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: NewOpc = ISD::DAGN; break; #define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: NewOpc = ISD::SETCC; break; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6236,7 +6236,7 @@ getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); return; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: #include "llvm/IR/ConstrainedOps.def" visitConstrainedFPIntrinsic(cast(I)); @@ -6999,14 +6999,60 @@ Opers.push_back(getValue(FPI.getArgOperand(1))); } + auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) { + assert(Result.getNode()->getNumValues() == 2); + + // Push node to the appropriate list so that future instructions can be + // chained up correctly. + SDValue OutChain = Result.getValue(1); + switch (EB) { + case fp::ExceptionBehavior::ebIgnore: + // The only reason why ebIgnore nodes still need to be chained is that + // they might depend on the current rounding mode, and therefore must + // not be moved across instruction that may change that mode. + LLVM_FALLTHROUGH; + case fp::ExceptionBehavior::ebMayTrap: + // These must not be moved across calls or instructions that may change + // floating-point exception masks. + PendingConstrainedFP.push_back(OutChain); + break; + case fp::ExceptionBehavior::ebStrict: + // These must not be moved across calls or instructions that may change + // floating-point exception masks or read floating-point exception flags. 
+ // In addition, they cannot be optimized out even if unused. + PendingConstrainedFPStrict.push_back(OutChain); + break; + } + }; + + SDVTList VTs = DAG.getVTList(ValueVTs); + fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue(); + unsigned Opcode; switch (FPI.getIntrinsicID()) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case Intrinsic::INTRINSIC: \ Opcode = ISD::STRICT_##DAGN; \ break; #include "llvm/IR/ConstrainedOps.def" + case Intrinsic::experimental_constrained_fmuladd: { + Opcode = ISD::STRICT_FMA; + // Break fmuladd into fmul and fadd. + if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict || + !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), + ValueVTs[0])) { + Opers.pop_back(); + SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers); + pushOutChain(Mul, EB); + Opcode = ISD::STRICT_FADD; + Opers.clear(); + Opers.push_back(Mul.getValue(1)); + Opers.push_back(Mul.getValue(0)); + Opers.push_back(getValue(FPI.getArgOperand(2))); + } + break; + } } // A few strict DAG nodes carry additional operands that are not @@ -7025,32 +7071,8 @@ } } - SDVTList VTs = DAG.getVTList(ValueVTs); SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers); - - assert(Result.getNode()->getNumValues() == 2); - - // Push node to the appropriate list so that future instructions can be - // chained up correctly. - SDValue OutChain = Result.getValue(1); - switch (FPI.getExceptionBehavior().getValue()) { - case fp::ExceptionBehavior::ebIgnore: - // The only reason why ebIgnore nodes still need to be chained is that - // they might depend on the current rounding mode, and therefore must - // not be moved across instruction that may change that mode. 
- LLVM_FALLTHROUGH; - case fp::ExceptionBehavior::ebMayTrap: - // These must not be moved across calls or instructions that may change - // floating-point exception masks. - PendingConstrainedFP.push_back(OutChain); - break; - case fp::ExceptionBehavior::ebStrict: - // These must not be moved across calls or instructions that may change - // floating-point exception masks or read floating-point exception flags. - // In addition, they cannot be optimized out even if unused. - PendingConstrainedFPStrict.push_back(OutChain); - break; - } + pushOutChain(Result, EB); SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -702,7 +702,7 @@ } // Constrained floating-point operations default to expand. -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ setOperationAction(ISD::STRICT_##DAGN, VT, Expand); #include "llvm/IR/ConstrainedOps.def" Index: llvm/lib/IR/IntrinsicInst.cpp =================================================================== --- llvm/lib/IR/IntrinsicInst.cpp +++ llvm/lib/IR/IntrinsicInst.cpp @@ -149,7 +149,7 @@ switch (getIntrinsicID()) { default: return false; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: \ return NARG == 1; #include "llvm/IR/ConstrainedOps.def" @@ -160,7 +160,7 @@ switch (getIntrinsicID()) { default: return false; -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: \ return NARG == 3; #include "llvm/IR/ConstrainedOps.def" @@ -169,7 +169,7 @@ bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) { switch (I->getIntrinsicID()) { -#define 
INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC, DAGN) \ +#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: #include "llvm/IR/ConstrainedOps.def" return true; Index: llvm/lib/IR/Verifier.cpp =================================================================== --- llvm/lib/IR/Verifier.cpp +++ llvm/lib/IR/Verifier.cpp @@ -4318,7 +4318,7 @@ "an array"); break; } -#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC, DAGN) \ +#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: #include "llvm/IR/ConstrainedOps.def" visitConstrainedFPIntrinsic(cast(Call)); @@ -4754,7 +4754,7 @@ unsigned NumOperands; bool HasRoundingMD; switch (FPI.getIntrinsicID()) { -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ case Intrinsic::INTRINSIC: \ NumOperands = NARG; \ HasRoundingMD = ROUND_MODE; \ Index: llvm/test/CodeGen/X86/fp-intrinsics-fma.ll =================================================================== --- llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -3,6 +3,104 @@ ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA +; Verify constrained fmul and fadd aren't fused. 
+define float @f11(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f11: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f11: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; FMA-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %4 +} + +; Verify constrained fmul and fadd aren't fused. +define double @f12(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f12: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f12: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; FMA-NEXT: vaddsd %xmm2, %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %4 +} + +; Verify that fmuladd(3.5) isn't simplified when the rounding mode is +; unknown. 
+define float @f15() #0 { +; NOFMA-LABEL: f15: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NOFMA-NEXT: movaps %xmm1, %xmm0 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm1, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f15: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 +; FMA-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.fmuladd.f32( + float 3.5, + float 3.5, + float 3.5, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that fmuladd(42.1) isn't simplified when the rounding mode is +; unknown. +define double @f16() #0 { +; NOFMA-LABEL: f16: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NOFMA-NEXT: movapd %xmm1, %xmm0 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm1, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f16: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 +; FMA-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.fmuladd.f64( + double 42.1, + double 42.1, + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that fma(3.5) isn't simplified when the rounding mode is ; unknown. 
define float @f17() #0 { @@ -65,5 +163,11 @@ attributes #0 = { strictfp } +declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) +declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)