Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -13021,6 +13021,41 @@ value operands and has the same type as the operands. The remainder has the same sign as the dividend. +'``llvm.experimental.constrained.fma``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.fma( , , , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.fma``' intrinsic returns the result of a +fused-multiply-add operation on its operands. + +Arguments: +"""""""""" + +The first three arguments to the '``llvm.experimental.constrained.fma``' +intrinsic must be :ref:`floating point ` or :ref:`vector +` of floating point values. All arguments must have identical types. + +The fourth and fifth arguments specify the rounding mode and exception behavior +as described above. + +Semantics: +"""""""""" + +The result produced is the product of the first two operands added to the third +operand computed with infinite precision, and then rounded to the target +precision. Constrained libm-equivalent Intrinsics -------------------------------------- Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -263,6 +263,7 @@ /// They are used to limit optimizations while the DAG is being /// optimized. STRICT_FADD, STRICT_FSUB, STRICT_FMUL, STRICT_FDIV, STRICT_FREM, + STRICT_FMA, /// Constrained versions of libm-equivalent floating point intrinsics. /// These will be lowered to the equivalent non-constrained pseudo-op Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -623,13 +623,14 @@ /// Test if this node is a strict floating point pseudo-op. bool isStrictFPOpcode() { switch (NodeType) { - default: + default: return false; case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: case ISD::STRICT_FDIV: case ISD::STRICT_FREM: + case ISD::STRICT_FMA: case ISD::STRICT_FSQRT: case ISD::STRICT_FPOW: case ISD::STRICT_FPOWI: Index: include/llvm/IR/IntrinsicInst.h =================================================================== --- include/llvm/IR/IntrinsicInst.h +++ include/llvm/IR/IntrinsicInst.h @@ -167,6 +167,7 @@ }; bool isUnaryOp() const; + bool isTernaryOp() const; RoundingMode getRoundingMode() const; ExceptionBehavior getExceptionBehavior() const; @@ -178,6 +179,7 @@ case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_sqrt: case Intrinsic::experimental_constrained_pow: case Intrinsic::experimental_constrained_powi: Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -490,6 +490,13 @@ llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_fma : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + // These intrinsics are sensitive to the rounding mode so we need constrained // versions of each of them. When strict rounding and exception control are // not required the non-constrained versions of these intrinsics should be Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -907,6 +907,7 @@ case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; + case ISD::STRICT_FMA: EqOpc = ISD::FMA; break; case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; @@ -1072,6 +1073,7 @@ } break; case ISD::STRICT_FSQRT: + case ISD::STRICT_FMA: case ISD::STRICT_FPOW: case ISD::STRICT_FPOWI: case ISD::STRICT_FSIN: @@ -1240,7 +1242,7 @@ // If the index is dependent on the store we will introduce a cycle when // creating the load (the load uses the index, and by replacing the chain // we will make the index dependent on the load). Also, the store might be - // dependent on the extractelement and introduce a cycle when creating + // dependent on the extractelement and introduce a cycle when creating // the load. if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) || ST->hasPredecessor(Op.getNode())) @@ -4065,6 +4067,10 @@ Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128)); + case ISD::STRICT_FMA: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, + RTLIB::FMA_F80, RTLIB::FMA_F128, + RTLIB::FMA_PPCF128)); break; case ISD::FADD: Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6695,6 +6695,7 @@ unsigned OrigOpc = Node->getOpcode(); unsigned NewOpc; bool IsUnary = false; + bool IsTernary = false; switch (OrigOpc) { default: llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); @@ -6703,6 +6704,7 @@ case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break; case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break; case ISD::STRICT_FREM: NewOpc = ISD::FREM; break; + case ISD::STRICT_FMA: NewOpc = ISD::FMA; IsTernary = true; break; case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break; case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break; case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break; @@ -6729,6 +6731,10 @@ SDNode *Res = nullptr; if (IsUnary) Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) }); + else if (IsTernary) + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), + Node->getOperand(2), + Node->getOperand(3)}); else Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), Node->getOperand(2) }); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5432,6 +5432,7 @@ case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_sqrt: case Intrinsic::experimental_constrained_pow: case Intrinsic::experimental_constrained_powi: @@ -5963,6 +5964,9 @@ case Intrinsic::experimental_constrained_frem: Opcode = ISD::STRICT_FREM; break; + case Intrinsic::experimental_constrained_fma: + Opcode = ISD::STRICT_FMA; + break; case Intrinsic::experimental_constrained_sqrt: Opcode = ISD::STRICT_FSQRT; break; @@ -6009,10 +6013,15 @@ SDVTList VTs = DAG.getVTList(ValueVTs); SDValue Result; if (FPI.isUnaryOp()) - Result = DAG.getNode(Opcode, sdl, VTs, + Result = DAG.getNode(Opcode, sdl, VTs, { Chain, getValue(FPI.getArgOperand(0)) }); + else if (FPI.isTernaryOp()) + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)), + getValue(FPI.getArgOperand(1)), + getValue(FPI.getArgOperand(2)) }); else - Result = DAG.getNode(Opcode, sdl, VTs, + Result = DAG.getNode(Opcode, sdl, VTs, { Chain, getValue(FPI.getArgOperand(0)), getValue(FPI.getArgOperand(1)) }); Index: lib/IR/IntrinsicInst.cpp =================================================================== --- lib/IR/IntrinsicInst.cpp +++ lib/IR/IntrinsicInst.cpp @@ -14,10 +14,10 @@ // are all subclasses of the CallInst class. Note that none of these classes // has state or virtual methods, which is an important part of this gross/neat // hack working. -// +// // In some cases, arguments to intrinsics need to be generic and are defined as // type pointer to empty struct { }*. To access the real item of interest the -// cast instruction needs to be stripped away. +// cast instruction needs to be stripped away. // //===----------------------------------------------------------------------===// @@ -98,7 +98,7 @@ ConstrainedFPIntrinsic::RoundingMode ConstrainedFPIntrinsic::getRoundingMode() const { unsigned NumOperands = getNumArgOperands(); - Metadata *MD = + Metadata *MD = dyn_cast(getArgOperand(NumOperands - 2))->getMetadata(); if (!MD || !isa(MD)) return rmInvalid; @@ -118,7 +118,7 @@ ConstrainedFPIntrinsic::ExceptionBehavior ConstrainedFPIntrinsic::getExceptionBehavior() const { unsigned NumOperands = getNumArgOperands(); - Metadata *MD = + Metadata *MD = dyn_cast(getArgOperand(NumOperands - 1))->getMetadata(); if (!MD || !isa(MD)) return ebInvalid; @@ -132,7 +132,7 @@ bool ConstrainedFPIntrinsic::isUnaryOp() const { switch (getIntrinsicID()) { - default: + default: return false; case Intrinsic::experimental_constrained_sqrt: case Intrinsic::experimental_constrained_sin: @@ -147,3 +147,13 @@ return true; } } + +bool ConstrainedFPIntrinsic::isTernaryOp() const { + switch (getIntrinsicID()) { + default: + return false; + case Intrinsic::experimental_constrained_fma: + return true; + } +} + Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -3973,6 +3973,7 @@ case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_sqrt: case Intrinsic::experimental_constrained_pow: case Intrinsic::experimental_constrained_powi: @@ -4433,8 +4434,9 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { unsigned NumOperands = FPI.getNumArgOperands(); - Assert(((NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)), - "invalid arguments for constrained FP intrinsic", &FPI); + Assert(((NumOperands == 5 && FPI.isTernaryOp()) || + (NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)), + "invalid arguments for constrained FP intrinsic", &FPI); Assert(isa(FPI.getArgOperand(NumOperands-1)), "invalid exception behavior argument", &FPI); Assert(isa(FPI.getArgOperand(NumOperands-2)), Index: test/CodeGen/X86/fp-intrinsics.ll =================================================================== --- test/CodeGen/X86/fp-intrinsics.ll +++ test/CodeGen/X86/fp-intrinsics.ll @@ -1,4 +1,5 @@ -; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s +; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s +; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck -check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s ; Verify that constants aren't folded to inexact results when the rounding mode ; is unknown. @@ -9,7 +10,7 @@ ; } ; ; CHECK-LABEL: f1 -; CHECK: divsd +; COMMON: divsd define double @f1() { entry: %div = call double @llvm.experimental.constrained.fdiv.f64( @@ -29,7 +30,7 @@ ; } ; ; CHECK-LABEL: f2 -; CHECK: subsd +; COMMON: subsd define double @f2(double %a) { entry: %div = call double @llvm.experimental.constrained.fsub.f64( @@ -50,9 +51,9 @@ ; } ; ; CHECK-LABEL: f3: -; CHECK: subsd -; CHECK: mulsd -; CHECK: subsd +; COMMON: subsd +; COMMON: mulsd +; COMMON: subsd define double @f3(double %a, double %b) { entry: %sub = call double @llvm.experimental.constrained.fsub.f64( @@ -81,11 +82,11 @@ ; return a; ; } ; -; +; ; CHECK-LABEL: f4: -; CHECK: testl -; CHECK: jle -; CHECK: addsd +; COMMON: testl +; COMMON: jle +; COMMON: addsd define double @f4(i32 %n, double %a) { entry: %cmp = icmp sgt i32 %n, 0 @@ -105,7 +106,7 @@ ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f5 -; CHECK: sqrtsd +; COMMON: sqrtsd define double @f5() { entry: %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0, @@ -116,7 +117,7 @@ ; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f6 -; CHECK: pow +; COMMON: pow define double @f6() { entry: %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, @@ -128,7 +129,7 @@ ; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f7 -; CHECK: powi +; COMMON: powi define double @f7() { entry: %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, @@ -140,7 +141,7 @@ ; Verify that sin(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f8 -; CHECK: sin +; COMMON: sin define double @f8() { entry: %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, @@ -151,7 +152,7 @@ ; Verify that cos(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f9 -; CHECK: cos +; COMMON: cos define double @f9() { entry: %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, @@ -162,7 +163,7 @@ ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f10 -; CHECK: exp +; COMMON: exp define double @f10() { entry: %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, @@ -173,7 +174,7 @@ ; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f11 -; CHECK: exp2 +; COMMON: exp2 define double @f11() { entry: %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, @@ -184,7 +185,7 @@ ; Verify that log(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f12 -; CHECK: log +; COMMON: log define double @f12() { entry: %result = call double @llvm.experimental.constrained.log.f64(double 42.0, @@ -195,7 +196,7 @@ ; Verify that log10(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f13 -; CHECK: log10 +; COMMON: log10 define double @f13() { entry: %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, @@ -206,7 +207,7 @@ ; Verify that log2(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f14 -; CHECK: log2 +; COMMON: log2 define double @f14() { entry: %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, @@ -217,7 +218,8 @@ ; Verify that rint(42.1) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f15 -; CHECK: rint +; NO-FMA: rint +; HAS-FMA: vroundsd define double @f15() { entry: %result = call double @llvm.experimental.constrained.rint.f64(double 42.1, @@ -229,7 +231,8 @@ ; Verify that nearbyint(42.1) isn't simplified when the rounding mode is ; unknown. ; CHECK-LABEL: f16 -; CHECK: nearbyint +; NO-FMA: nearbyint +; HAS-FMA: vroundsd define double @f16() { entry: %result = call double @llvm.experimental.constrained.nearbyint.f64( @@ -239,6 +242,38 @@ ret double %result } +; Verify that fma(3.5) isn't simplified when the rounding mode is +; unknown. +; CHECK-LABEL: f17 +; FMACALL32: jmp fmaf # TAILCALL +; FMA32: vfmadd213ss +define float @f17() { +entry: + %result = call float @llvm.experimental.constrained.fma.f32( + float 3.5, + float 3.5, + float 3.5, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret float %result +} + +; Verify that fma(42.1) isn't simplified when the rounding mode is +; unknown. +; CHECK-LABEL: f18 +; FMACALL64: jmp fma # TAILCALL +; FMA64: vfmadd213sd +define double @f18() { +entry: + %result = call double @llvm.experimental.constrained.fma.f64( + double 42.1, + double 42.1, + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) @@ -256,3 +291,5 @@ declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) +declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) Index: test/Feature/fp-intrinsics.ll =================================================================== --- test/Feature/fp-intrinsics.ll +++ test/Feature/fp-intrinsics.ll @@ -73,7 +73,7 @@ ; return a; ; } ; -; +; ; CHECK-LABEL: @f4 ; CHECK-NOT: select ; CHECK: br i1 %cmp @@ -94,7 +94,6 @@ ret double %a.0 } - ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f5 ; CHECK: call double @llvm.experimental.constrained.sqrt @@ -231,6 +230,18 @@ ret double %result } +; Verify that fma(42.1) isn't simplified when the rounding mode is +; unknown. +; CHECK-LABEL: f17 +; CHECK: call double @llvm.experimental.constrained.fma +define double @f17() { +entry: + %result = call double @llvm.experimental.constrained.fma.f64(double 42.1, double 42.1, double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) @@ -248,3 +259,4 @@ declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)