diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -3361,7 +3361,7 @@ // the add operand respectively. This allows fmuladd to represent a*b-c, or // c-a*b. Patterns in LLVM should catch the negated forms and translate them to // efficient operations. -static Value* buildFMulAdd(llvm::BinaryOperator *MulOp, Value *Addend, +static Value* buildFMulAdd(llvm::Instruction *MulOp, Value *Addend, const CodeGenFunction &CGF, CGBuilderTy &Builder, bool negMul, bool negAdd) { assert(!(negMul && negAdd) && "Only one of negMul and negAdd should be set."); @@ -3373,12 +3373,19 @@ if (negAdd) Addend = Builder.CreateFNeg(Addend, "neg"); - Value *FMulAdd = Builder.CreateCall( - CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()), - {MulOp0, MulOp1, Addend}); - MulOp->eraseFromParent(); + Value *FMulAdd = nullptr; + if (Builder.getIsFPConstrained()) + FMulAdd = Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::experimental_constrained_fmuladd, + Addend->getType()), + {MulOp0, MulOp1, Addend, MulOp->getOperand(2), MulOp->getOperand(3)}); + else + FMulAdd = Builder.CreateCall( + CGF.CGM.getIntrinsic(llvm::Intrinsic::fmuladd, Addend->getType()), + {MulOp0, MulOp1, Addend}); + MulOp->eraseFromParent(); - return FMulAdd; + return FMulAdd; } // Check whether it would be legal to emit an fmuladd intrinsic call to @@ -3413,6 +3420,21 @@ return buildFMulAdd(RHSBinOp, op.LHS, CGF, Builder, isSub, false); } + if (Builder.getIsFPConstrained()) { + if (auto *LHSBinOp = dyn_cast(op.LHS)) { + if (LHSBinOp->getIntrinsicID() == + llvm::Intrinsic::experimental_constrained_fmul && + LHSBinOp->use_empty()) + return buildFMulAdd(LHSBinOp, op.RHS, CGF, Builder, false, isSub); + } + if (auto *RHSBinOp = dyn_cast(op.RHS)) { + if (RHSBinOp->getIntrinsicID() == + llvm::Intrinsic::experimental_constrained_fmul && + RHSBinOp->use_empty()) + return buildFMulAdd(RHSBinOp, 
op.LHS, CGF, Builder, isSub, false); + } + } + return nullptr; } diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c --- a/clang/test/CodeGen/constrained-math-builtins.c +++ b/clang/test/CodeGen/constrained-math-builtins.c @@ -148,3 +148,13 @@ // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) }; +#pragma STDC FP_CONTRACT ON +void bar(float f) { + f * f + f; + (double)f * f + f; + (long double)f * f + f; + +// CHECK: declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) +// CHECK: declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata) +// CHECK: declare x86_fp80 @llvm.experimental.constrained.fmuladd.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata) +}; diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16061,6 +16061,63 @@ performed by '``llvm.experimental.constrained.fcmps``' will raise an exception if either operand is a NAN (QNAN or SNAN). +'``llvm.experimental.constrained.fmuladd``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.fmuladd( , , + , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.fmuladd``' intrinsic represents +multiply-add expressions that can be fused if the code generator determines +that (a) the target instruction set has support for a fused operation, +and (b) that the fused operation is more efficient than the equivalent, +separate pair of mul and add instructions. + +Arguments: +"""""""""" + +The first three arguments to the '``llvm.experimental.constrained.fmuladd``' +intrinsic must be floating-point or vector of floating-point values. +All three arguments must have identical types. 
+ +The fourth and fifth arguments specify the rounding mode and exception behavior +as described above. + +Semantics: +"""""""""" + +The expression: + +:: + + %0 = call float @llvm.experimental.constrained.fmuladd.f32(%a, %b, %c) + +is equivalent to the expression: + +:: + + %0 = call float @llvm.experimental.constrained.fmul.f32(%a, %b) + %1 = call float @llvm.experimental.constrained.fadd.f32(%0, %c) + +except that it is unspecified whether rounding will be performed between the +multiplication and addition steps. Fusion is not guaranteed, even if the target +platform supports it. +If a fused multiply-add is required, the corresponding +:ref:`llvm.experimental.constrained.fma ` intrinsic function should be +used instead. +This never sets errno, just like '``llvm.experimental.constrained.fma.*``'. + Constrained libm-equivalent Intrinsics -------------------------------------- diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1286,6 +1286,9 @@ case Intrinsic::fmuladd: ISDs.push_back(ISD::FMA); break; + case Intrinsic::experimental_constrained_fmuladd: + ISDs.push_back(ISD::STRICT_FMA); + break; // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: @@ -1509,6 +1512,13 @@ if (IID == Intrinsic::fmuladd) return ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) + ConcreteTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy); + // FIXME: Is a constrained intrinsic's cost equal to its non-strict one? + if (IID == Intrinsic::experimental_constrained_fmuladd) + return ConcreteTTI->getIntrinsicCost( + Intrinsic::experimental_constrained_fmul, RetTy, Tys, + nullptr) + + ConcreteTTI->getIntrinsicCost( + Intrinsic::experimental_constrained_fadd, RetTy, Tys, nullptr); // Else, assume that we need to scalarize this intrinsic. 
For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -352,6 +352,10 @@ /// comparison operation. STRICT_FSETCC, STRICT_FSETCCS, + /// FMULADD/STRICT_FMULADD - An intermediate node that lets functions handle + /// constrained fmuladd the same way as other constrained intrinsics. + FMULADD, STRICT_FMULADD, + /// FMA - Perform a * b + c with no intermediate rounding step. FMA, diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -81,6 +81,10 @@ FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT) FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC) +// This is the definition for the fmuladd intrinsic function, which is converted +// into constrained FMA or FMUL + FADD intrinsics. 
+FUNCTION(fmuladd, 3, 1, experimental_constrained_fmuladd, FMULADD) + #undef INSTRUCTION #undef FUNCTION #undef CMP_INSTRUCTION diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -626,6 +626,13 @@ llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_fmuladd : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptosi : Intrinsic<[ llvm_anyint_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty ]>; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7059,31 +7059,53 @@ } SDVTList VTs = DAG.getVTList(ValueVTs); - SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers); - - assert(Result.getNode()->getNumValues() == 2); - - // Push node to the appropriate list so that future instructions can be - // chained up correctly. - SDValue OutChain = Result.getValue(1); - switch (FPI.getExceptionBehavior().getValue()) { - case fp::ExceptionBehavior::ebIgnore: - // The only reason why ebIgnore nodes still need to be chained is that - // they might depend on the current rounding mode, and therefore must - // not be moved across instruction that may change that mode. - LLVM_FALLTHROUGH; - case fp::ExceptionBehavior::ebMayTrap: - // These must not be moved across calls or instructions that may change - // floating-point exception masks. - PendingConstrainedFP.push_back(OutChain); - break; - case fp::ExceptionBehavior::ebStrict: - // These must not be moved across calls or instructions that may change - // floating-point exception masks or read floating-point exception flags. - // In addition, they cannot be optimized out even if unused. 
- PendingConstrainedFPStrict.push_back(OutChain); - break; - } + SDValue Result; + + auto pushOutChain = [&]() { + assert(Result.getNode()->getNumValues() == 2); + + // Push node to the appropriate list so that future instructions can be + // chained up correctly. + SDValue OutChain = Result.getValue(1); + switch (FPI.getExceptionBehavior().getValue()) { + case fp::ExceptionBehavior::ebIgnore: + // The only reason why ebIgnore nodes still need to be chained is that + // they might depend on the current rounding mode, and therefore must + // not be moved across instruction that may change that mode. + LLVM_FALLTHROUGH; + case fp::ExceptionBehavior::ebMayTrap: + // These must not be moved across calls or instructions that may change + // floating-point exception masks. + PendingConstrainedFP.push_back(OutChain); + break; + case fp::ExceptionBehavior::ebStrict: + // These must not be moved across calls or instructions that may change + // floating-point exception masks or read floating-point exception flags. + // In addition, they cannot be optimized out even if unused. + PendingConstrainedFPStrict.push_back(OutChain); + break; + } + }; + + if (Opcode == ISD::STRICT_FMULADD) { + Opcode = ISD::STRICT_FMA; + // Break fmuladd into fmul and fadd. 
+ if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict || + !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), + ValueVTs[0])) { + Opers.pop_back(); + Result = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers); + pushOutChain(); + Opcode = ISD::STRICT_FADD; + Opers.clear(); + Opers.push_back(Result.getValue(1)); + Opers.push_back(Result.getValue(0)); + Opers.push_back(getValue(FPI.getArgOperand(2))); + } + } + + Result = DAG.getNode(Opcode, sdl, VTs, Opers); + pushOutChain(); SDValue FPResult = Result.getValue(0); setValue(&FPI, FPResult); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -468,9 +468,9 @@ // FMA nodes. // We use the target independent ISD::FMA for the non-inverted case. - FNMADD, - FMSUB, - FNMSUB, + FNMADD, STRICT_FNMADD, + FMSUB, STRICT_FMSUB, + FNMSUB, STRICT_FNMSUB, FMADDSUB, FMSUBADD, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2000,6 +2000,7 @@ setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::STRICT_FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); @@ -29817,8 +29818,11 @@ case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; + case X86ISD::STRICT_FMSUB: return "X86ISD::STRICT_FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; + case X86ISD::STRICT_FNMADD: return "X86ISD::STRICT_FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; + case X86ISD::STRICT_FNMSUB: return "X86ISD::STRICT_FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; @@ 
-42514,37 +42518,46 @@ if (NegMul) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FNMADD; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMADD: Opcode = ISD::FMA; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; + case ISD::FMA: Opcode = X86ISD::FNMADD; break; + case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; + case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMADD: Opcode = ISD::FMA; break; + case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; + case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; } } if (NegAcc) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FMSUB; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; - case X86ISD::FMSUB: Opcode = ISD::FMA; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; - case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; - case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; - 
case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; + case ISD::FMA: Opcode = X86ISD::FMSUB; break; + case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FMSUB: Opcode = ISD::FMA; break; + case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; + case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; } } if (NegRes) { switch (Opcode) { + // For accuracy reason, we never combine fneg and fma under strict FP. default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; @@ -43516,6 +43529,7 @@ const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); + bool IsStrict = N->isStrictFPOpcode(); // Let legalize expand this if it isn't a legal type yet. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -43526,9 +43540,9 @@ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); - SDValue A = N->getOperand(0); - SDValue B = N->getOperand(1); - SDValue C = N->getOperand(2); + SDValue A = N->getOperand(IsStrict ? 1 : 0); + SDValue B = N->getOperand(IsStrict ? 2 : 1); + SDValue C = N->getOperand(IsStrict ? 
3 : 2); auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); @@ -43566,9 +43580,15 @@ unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); - if (N->getNumOperands() == 4) - return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); - return DAG.getNode(NewOpcode, dl, VT, A, B, C); + if (IsStrict) { + assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4"); + return DAG.getNode(NewOpcode, dl, {VT, MVT::Other}, + {N->getOperand(0), A, B, C}); + } else { + if (N->getNumOperands() == 4) + return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); + return DAG.getNode(NewOpcode, dl, VT, A, B, C); + } } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) @@ -46071,12 +46091,16 @@ case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: + case X86ISD::STRICT_FMSUB: case X86ISD::FMSUB_RND: case X86ISD::FNMADD: + case X86ISD::STRICT_FNMADD: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: + case X86ISD::STRICT_FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); + case ISD::FMA: + case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -6487,11 +6487,11 @@ } defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; +defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub, X86FmsubRnd>; defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; -defm 
VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; +defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd, X86FnmaddRnd>; +defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub, X86FnmsubRnd>; multiclass avx512_fma3p_231_rm opc, string OpcodeStr, SDNode OpNode, @@ -6565,11 +6565,11 @@ } defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; +defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub, X86FmsubRnd>; defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; -defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; +defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd, X86FnmaddRnd>; +defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub, X86FnmsubRnd>; multiclass avx512_fma3p_132_rm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, @@ -6645,11 +6645,11 @@ } defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; +defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub, X86FmsubRnd>; defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; -defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>; +defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd, 
X86FnmaddRnd>; +defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub, X86FnmsubRnd>; // Scalar FMA multiclass avx512_fma3s_common opc, string OpcodeStr, X86VectorVTInfo _, @@ -6742,9 +6742,9 @@ } defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; -defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>; multiclass avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; -defm : avx512_scalar_fma_patterns; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -126,7 +126,7 @@ loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32, SchedWriteFMA>; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", - loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32, + loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32, SchedWriteFMA>; defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS", loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32, @@ -141,7 +141,7 @@ loadv2f64, loadv4f64, X86any_Fmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD", - loadv2f64, loadv4f64, X86Fmsub, v2f64, + loadv2f64, loadv4f64, X86any_Fmsub, v2f64, v4f64, SchedWriteFMA>, 
VEX_W; defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD", loadv2f64, loadv4f64, X86Fmaddsub, @@ -154,15 +154,15 @@ // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32, - loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>; + loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>; defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32, - loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>; + loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>; } let ExeDomain = SSEPackedDouble in { defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64, - loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; + loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64, - loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W; + loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W; } // All source register operands of FMA opcodes defined in fma3s_rm multiclass @@ -321,12 +321,12 @@ defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd, SchedWriteFMA.Scl>, VEX_LIG; -defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub, +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub, SchedWriteFMA.Scl>, VEX_LIG; -defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd, +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd, SchedWriteFMA.Scl>, VEX_LIG; -defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub, +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub, SchedWriteFMA.Scl>, VEX_LIG; multiclass scalar_fma_patterns; -defm : scalar_fma_patterns; -defm : scalar_fma_patterns; -defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; defm : scalar_fma_patterns; -defm : scalar_fma_patterns; -defm : scalar_fma_patterns; -defm : 
scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; //===----------------------------------------------------------------------===// // FMA4 - AMD 4 operand Fused Multiply-Add instructions @@ -542,26 +542,26 @@ SchedWriteFMA.Scl>, fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, SchedWriteFMA.Scl>; - defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32, + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32, SchedWriteFMA.Scl>, + X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, SchedWriteFMA.Scl>; defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32, SchedWriteFMA.Scl>, + X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; - defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; - defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; - defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; @@ -575,26 +575,26 @@ SchedWriteFMA.Scl>, fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; - defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64, + defm 
VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64, SchedWriteFMA.Scl>, + X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64, SchedWriteFMA.Scl>, + X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; - defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; - defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; - defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; @@ -630,11 +630,11 @@ } defm : scalar_fma4_patterns; -defm : scalar_fma4_patterns; -defm : scalar_fma4_patterns; -defm : scalar_fma4_patterns; +defm : scalar_fma4_patterns; +defm : scalar_fma4_patterns; +defm : scalar_fma4_patterns; defm : scalar_fma4_patterns; -defm : scalar_fma4_patterns; -defm : scalar_fma4_patterns; -defm : scalar_fma4_patterns; +defm : scalar_fma4_patterns; +defm : scalar_fma4_patterns; +defm : scalar_fma4_patterns; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ 
b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -535,8 +535,20 @@ [(X86strict_Fmadd node:$src1, node:$src2, node:$src3), (X86Fmadd node:$src1, node:$src2, node:$src3)]>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3), + (X86Fnmadd node:$src1, node:$src2, node:$src3)]>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fmsub node:$src1, node:$src2, node:$src3), + (X86Fmsub node:$src1, node:$src2, node:$src3)]>; def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3), + (X86Fnmsub node:$src1, node:$src2, node:$src3)]>; def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>; diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -1,7 +1,339 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefixes=COMMON,NOFMA -; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA -; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s 
--check-prefixes=COMMON,FMA +; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX1 +; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX512 + +define float @f1(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f1: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f1: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; FMA-NEXT: retq +entry: + %3 = fneg float %0 + %result = call float @llvm.experimental.constrained.fmuladd.f32(float %3, float %1, float %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double @f2(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f2: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: xorpd {{.*}}(%rip), %xmm0 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f2: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 +; FMA-NEXT: retq +entry: + %3 = fneg double %0 + %result = call double @llvm.experimental.constrained.fmuladd.f64(double %3, double %1, double %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @f3(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f3: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f3: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; FMA-NEXT: retq +entry: + %3 = fneg float %2 + %result = call float @llvm.experimental.constrained.fmuladd.f32(float %0, float %1, float %3, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double 
@f4(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f4: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: xorpd {{.*}}(%rip), %xmm2 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f4: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; FMA-NEXT: retq +entry: + %3 = fneg double %2 + %result = call double @llvm.experimental.constrained.fmuladd.f64(double %0, double %1, double %3, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @f5(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f5: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; NOFMA-NEXT: xorps %xmm3, %xmm0 +; NOFMA-NEXT: xorps %xmm3, %xmm2 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f5: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-NEXT: retq +entry: + %3 = fneg float %0 + %4 = fneg float %2 + %result = call float @llvm.experimental.constrained.fmuladd.f32(float %3, float %1, float %4, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double @f6(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f6: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; NOFMA-NEXT: xorpd %xmm3, %xmm0 +; NOFMA-NEXT: xorpd %xmm3, %xmm2 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f6: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-NEXT: retq +entry: + %3 = fneg double %0 + %4 = fneg double %2 + %result = call double @llvm.experimental.constrained.fmuladd.f64(double %3, double %1, double %4, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @f7(float %0, float %1, float %2) #0 { +; 
NOFMA-LABEL: f7: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0 +; NOFMA-NEXT: retq +; +; FMA-AVX1-LABEL: f7: +; FMA-AVX1: # %bb.0: # %entry +; FMA-AVX1-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: retq +; +; FMA-AVX512-LABEL: f7: +; FMA-AVX512: # %bb.0: # %entry +; FMA-AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; FMA-AVX512-NEXT: retq +entry: + %3 = call float @llvm.experimental.constrained.fmuladd.f32(float %0, float %1, float %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %result = fneg float %3 + ret float %result +} + +define double @f8(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f8: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: xorpd {{.*}}(%rip), %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f8: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call double @llvm.experimental.constrained.fmuladd.f64(double %0, double %1, double %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %result = fneg double %3 + ret double %result +} + +define float @f9(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f9: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; NOFMA-NEXT: xorps %xmm3, %xmm0 +; NOFMA-NEXT: xorps %xmm3, %xmm2 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: xorps %xmm3, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-AVX1-LABEL: f9: +; FMA-AVX1: # %bb.0: # %entry +; FMA-AVX1-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; 
FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: retq +; +; FMA-AVX512-LABEL: f9: +; FMA-AVX512: # %bb.0: # %entry +; FMA-AVX512-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; FMA-AVX512-NEXT: retq +entry: + %3 = fneg float %0 + %4 = fneg float %2 + %5 = call float @llvm.experimental.constrained.fmuladd.f32(float %3, float %1, float %4, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %result = fneg float %5 + ret float %result +} + +define double @f10(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f10: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; NOFMA-NEXT: xorpd %xmm3, %xmm0 +; NOFMA-NEXT: xorpd %xmm3, %xmm2 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: xorpd %xmm3, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f10: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = fneg double %0 + %4 = fneg double %2 + %5 = call double @llvm.experimental.constrained.fmuladd.f64(double %3, double %1, double %4, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %result = fneg double %5 + ret double %result +} + +; Verify constrained fmul and fadd aren't fused. 
+define float @f11(float %0, float %1, float %2) #0 { +; NOFMA-LABEL: f11: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f11: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; FMA-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %4 = call float @llvm.experimental.constrained.fadd.f32(float %3, float %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %4 +} + +; Verify constrained fmul and fadd aren't fused. +define double @f12(double %0, double %1, double %2) #0 { +; NOFMA-LABEL: f12: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm2, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f12: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; FMA-NEXT: vaddsd %xmm2, %xmm0, %xmm0 +; FMA-NEXT: retq +entry: + %3 = call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + %4 = call double @llvm.experimental.constrained.fadd.f64(double %3, double %2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %4 +} + +; Verify that fmuladd(3.5) isn't simplified when the rounding mode is +; unknown. 
+define float @f15() #0 { +; NOFMA-LABEL: f15: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NOFMA-NEXT: movaps %xmm1, %xmm0 +; NOFMA-NEXT: mulss %xmm1, %xmm0 +; NOFMA-NEXT: addss %xmm1, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f15: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 +; FMA-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.fmuladd.f32( + float 3.5, + float 3.5, + float 3.5, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that fmuladd(42.1) isn't simplified when the rounding mode is +; unknown. +define double @f16() #0 { +; NOFMA-LABEL: f16: +; NOFMA: # %bb.0: # %entry +; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NOFMA-NEXT: movapd %xmm1, %xmm0 +; NOFMA-NEXT: mulsd %xmm1, %xmm0 +; NOFMA-NEXT: addsd %xmm1, %xmm0 +; NOFMA-NEXT: retq +; +; FMA-LABEL: f16: +; FMA: # %bb.0: # %entry +; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0 +; FMA-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.fmuladd.f64( + double 42.1, + double 42.1, + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} ; Verify that fma(3.5) isn't simplified when the rounding mode is ; unknown. 
@@ -65,5 +397,11 @@ attributes #0 = { strictfp } +declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) +declare float @llvm.experimental.constrained.fmuladd.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fmuladd.f64(double, double, double, metadata, metadata)