Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -15685,6 +15685,84 @@
 mode argument is only intended as information to the compiler.
 
 
+'``llvm.experimental.constrained.lrint``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <inttype>
+      @llvm.experimental.constrained.lrint(<fptype> <op1>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.lrint``' intrinsic returns the first
+operand rounded to the nearest integer, rounding according to the current
+rounding direction. It may raise an inexact floating-point exception if the
+operand is not an integer.
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point number. The return value is an
+integer.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``lrint`` functions
+would, and handles error conditions in the same way. The rounding mode is
+described, not determined, by the rounding mode argument. The actual rounding
+mode is determined by the runtime floating-point environment. The rounding
+mode argument is only intended as information to the compiler.
+
+
+'``llvm.experimental.constrained.llrint``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <inttype>
+      @llvm.experimental.constrained.llrint(<fptype> <op1>,
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.llrint``' intrinsic returns the first
+operand rounded to the nearest integer, rounding according to the current
+rounding direction. It may raise an inexact floating-point exception if the
+operand is not an integer.
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point number. The return value is an
+integer.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``llrint`` functions
+would, and handles error conditions in the same way. The rounding mode is
+described, not determined, by the rounding mode argument. The actual rounding
+mode is determined by the runtime floating-point environment. The rounding
+mode argument is only intended as information to the compiler.
+
+
 '``llvm.experimental.constrained.nearbyint``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -15907,6 +15985,72 @@
 would and handles error conditions in the same way.
 
 
+'``llvm.experimental.constrained.lround``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <inttype>
+      @llvm.experimental.constrained.lround(<fptype> <op1>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.lround``' intrinsic returns the first
+operand rounded to the nearest integer, with halfway cases rounded away from
+zero regardless of the current rounding direction.
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point number. The return value is an
+integer.
+
+The second argument specifies the exception behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``lround`` functions
+would and handles error conditions in the same way.
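[Note: a quick IR-level sketch of how the two flavors are called once the
overloaded types are filled in. The ``i64``/``f64`` instantiations and the
wrapper function are illustrative only, not part of this patch; the metadata
strings are the ones used by the tests further down::

      declare i64 @llvm.experimental.constrained.lrint.i64.f64(double, metadata, metadata)
      declare i64 @llvm.experimental.constrained.lround.i64.f64(double, metadata)

      define i64 @rint_then_round(double %x) {
      entry:
        ; lrint honors the runtime rounding mode, so it takes both the
        ; rounding-mode and the exception-behavior metadata operands.
        %r = call i64 @llvm.experimental.constrained.lrint.i64.f64(double %x,
                          metadata !"round.dynamic",
                          metadata !"fpexcept.strict")
        ; lround always rounds halfway cases away from zero, so it takes no
        ; rounding-mode operand, only the exception behavior.
        %s = call i64 @llvm.experimental.constrained.lround.i64.f64(double %x,
                          metadata !"fpexcept.strict")
        %sum = add i64 %r, %s
        ret i64 %sum
      }
]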
+
+
+'``llvm.experimental.constrained.llround``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <inttype>
+      @llvm.experimental.constrained.llround(<fptype> <op1>,
+                                             metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.llround``' intrinsic returns the first
+operand rounded to the nearest integer, with halfway cases rounded away from
+zero regardless of the current rounding direction.
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point number. The return value is an
+integer.
+
+The second argument specifies the exception behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``llround`` functions
+would and handles error conditions in the same way.
+
+
 '``llvm.experimental.constrained.trunc``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -300,7 +300,8 @@
     STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS,
     STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2,
     STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FMAXNUM, STRICT_FMINNUM,
-    STRICT_FCEIL, STRICT_FFLOOR, STRICT_FROUND, STRICT_FTRUNC,
+    STRICT_FCEIL, STRICT_FFLOOR, STRICT_LROUND, STRICT_LLROUND, STRICT_FROUND,
+    STRICT_FTRUNC, STRICT_LRINT, STRICT_LLRINT,
 
     /// X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating
     /// point type down to the precision of the destination VT.  TRUNC is a
Index: include/llvm/CodeGen/SelectionDAGNodes.h
===================================================================
--- include/llvm/CodeGen/SelectionDAGNodes.h
+++ include/llvm/CodeGen/SelectionDAGNodes.h
@@ -696,12 +696,16 @@
     case ISD::STRICT_FLOG:
     case ISD::STRICT_FLOG10:
     case ISD::STRICT_FLOG2:
+    case ISD::STRICT_LRINT:
+    case ISD::STRICT_LLRINT:
     case ISD::STRICT_FRINT:
     case ISD::STRICT_FNEARBYINT:
     case ISD::STRICT_FMAXNUM:
     case ISD::STRICT_FMINNUM:
     case ISD::STRICT_FCEIL:
     case ISD::STRICT_FFLOOR:
+    case ISD::STRICT_LROUND:
+    case ISD::STRICT_LLROUND:
     case ISD::STRICT_FROUND:
     case ISD::STRICT_FTRUNC:
     case ISD::STRICT_FP_ROUND:
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -911,12 +911,16 @@
     case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break;
     case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break;
     case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
+    case ISD::STRICT_LRINT: EqOpc = ISD::LRINT; break;
+    case ISD::STRICT_LLRINT: EqOpc = ISD::LLRINT; break;
     case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
     case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
     case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break;
     case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break;
     case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break;
     case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break;
+    case ISD::STRICT_LROUND: EqOpc = ISD::LROUND; break;
+    case ISD::STRICT_LLROUND: EqOpc = ISD::LLROUND; break;
     case ISD::STRICT_FROUND: EqOpc = ISD::FROUND; break;
     case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break;
     case ISD::STRICT_FP_ROUND: EqOpc = ISD::FP_ROUND; break;
Index: include/llvm/IR/IntrinsicInst.h
===================================================================
--- include/llvm/IR/IntrinsicInst.h
+++ include/llvm/IR/IntrinsicInst.h
@@ -271,12 +271,16 @@
     case Intrinsic::experimental_constrained_log:
     case Intrinsic::experimental_constrained_log10:
     case Intrinsic::experimental_constrained_log2:
+    case Intrinsic::experimental_constrained_lrint:
+    case Intrinsic::experimental_constrained_llrint:
     case Intrinsic::experimental_constrained_rint:
     case Intrinsic::experimental_constrained_nearbyint:
     case Intrinsic::experimental_constrained_maxnum:
     case Intrinsic::experimental_constrained_minnum:
     case Intrinsic::experimental_constrained_ceil:
     case Intrinsic::experimental_constrained_floor:
+    case Intrinsic::experimental_constrained_lround:
+    case Intrinsic::experimental_constrained_llround:
     case Intrinsic::experimental_constrained_round:
     case Intrinsic::experimental_constrained_trunc:
       return true;
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -676,6 +676,14 @@
                                                   [ LLVMMatchType<0>,
                                                     llvm_metadata_ty,
                                                     llvm_metadata_ty ]>;
+  def int_experimental_constrained_lrint : Intrinsic<[ llvm_anyint_ty ],
+                                                     [ llvm_anyfloat_ty,
+                                                       llvm_metadata_ty,
+                                                       llvm_metadata_ty ]>;
+  def int_experimental_constrained_llrint : Intrinsic<[ llvm_anyint_ty ],
+                                                      [ llvm_anyfloat_ty,
+                                                        llvm_metadata_ty,
+                                                        llvm_metadata_ty ]>;
   def int_experimental_constrained_maxnum : Intrinsic<[ llvm_anyfloat_ty ],
                                                       [ LLVMMatchType<0>,
                                                         LLVMMatchType<0>,
@@ -694,6 +702,12 @@
                                                      [ LLVMMatchType<0>,
                                                        llvm_metadata_ty,
                                                        llvm_metadata_ty ]>;
+  def int_experimental_constrained_lround : Intrinsic<[ llvm_anyint_ty ],
+                                                      [ llvm_anyfloat_ty,
+                                                        llvm_metadata_ty ]>;
+  def int_experimental_constrained_llround : Intrinsic<[ llvm_anyint_ty ],
+                                                       [ llvm_anyfloat_ty,
+                                                         llvm_metadata_ty ]>;
   def int_experimental_constrained_round : Intrinsic<[ llvm_anyfloat_ty ],
                                                      [ LLVMMatchType<0>,
                                                        llvm_metadata_ty,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -493,12 +493,20 @@
                                  SDTFPUnaryOp, [SDNPHasChain]>;
 def strict_frint      : SDNode<"ISD::STRICT_FRINT",
                                SDTFPUnaryOp, [SDNPHasChain]>;
+def strict_lrint      : SDNode<"ISD::STRICT_LRINT",
+                               SDTFPToIntOp, [SDNPHasChain]>;
+def strict_llrint     : SDNode<"ISD::STRICT_LLRINT",
+                               SDTFPToIntOp, [SDNPHasChain]>;
 def strict_fnearbyint : SDNode<"ISD::STRICT_FNEARBYINT",
                                SDTFPUnaryOp, [SDNPHasChain]>;
 def strict_fceil      : SDNode<"ISD::STRICT_FCEIL",
                                SDTFPUnaryOp, [SDNPHasChain]>;
 def strict_ffloor     : SDNode<"ISD::STRICT_FFLOOR",
                                SDTFPUnaryOp, [SDNPHasChain]>;
+def strict_lround     : SDNode<"ISD::STRICT_LROUND",
+                               SDTFPToIntOp, [SDNPHasChain]>;
+def strict_llround    : SDNode<"ISD::STRICT_LLROUND",
+                               SDTFPToIntOp, [SDNPHasChain]>;
 def strict_fround     : SDNode<"ISD::STRICT_FROUND",
                                SDTFPUnaryOp, [SDNPHasChain]>;
 def strict_ftrunc     : SDNode<"ISD::STRICT_FTRUNC",
@@ -1271,6 +1279,12 @@
 def any_frint : PatFrags<(ops node:$src),
                          [(strict_frint node:$src),
                           (frint node:$src)]>;
+def any_lrint : PatFrags<(ops node:$src),
+                         [(strict_lrint node:$src),
+                          (lrint node:$src)]>;
+def any_llrint : PatFrags<(ops node:$src),
+                          [(strict_llrint node:$src),
+                           (llrint node:$src)]>;
 def any_fnearbyint : PatFrags<(ops node:$src),
                               [(strict_fnearbyint node:$src),
                                (fnearbyint node:$src)]>;
@@ -1280,6 +1294,12 @@
 def any_ffloor : PatFrags<(ops node:$src),
                           [(strict_ffloor node:$src),
                            (ffloor node:$src)]>;
+def any_lround : PatFrags<(ops node:$src),
+                          [(strict_lround node:$src),
+                           (lround node:$src)]>;
+def any_llround : PatFrags<(ops node:$src),
+                           [(strict_llround node:$src),
+                            (llround node:$src)]>;
 def any_fround : PatFrags<(ops node:$src),
                          [(strict_fround node:$src),
                           (fround node:$src)]>;
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -205,7 +205,7 @@
   LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
              dbgs() << "     with:      "; New->dump(&DAG));
 
-  assert(Old->getNumValues() == New->getNumValues() &&
-         "Replacing one node with another that produces a different number "
-         "of values!");
+  assert(Old->getNumValues() <= New->getNumValues() &&
+         "Replacing one node with another that produces fewer values!");
   DAG.ReplaceAllUsesWith(Old, New);
@@ -1130,6 +1130,16 @@
     Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
                                             Node->getValueType(0));
     break;
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
+    // These pseudo-ops are the same as the other STRICT_ ops except
+    // they are registered with setOperationAction() using the input type
+    // instead of the output type.
+    Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
+                                            Node->getOperand(1).getValueType());
+    break;
   case ISD::SADDSAT:
   case ISD::UADDSAT:
   case ISD::SSUBSAT:
@@ -2027,10 +2037,15 @@
 // and leave the Hi part unset.
 SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
                                             bool isSigned) {
+  SDValue CurInChain;
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
   for (const SDValue &Op : Node->op_values()) {
     EVT ArgVT = Op.getValueType();
+    // Strict FP nodes carry their chain as an operand; remember it instead
+    // of passing it to the libcall as an argument.
+    if (ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::Other) {
+      CurInChain = Op;
+      continue;
+    }
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Node = Op;
     Entry.Ty = ArgTy;
@@ -2048,7 +2063,11 @@
   // function.  If the libcall is going to be emitted as a tail call then
   // TLI.isUsedByReturnOnly will change it to the right chain if the return
   // node which is being folded has a non-entry input chain.
-  SDValue InChain = DAG.getEntryNode();
+  SDValue InChain;
+  if (Node->isStrictFPOpcode())
+    InChain = CurInChain;
+  else
+    InChain = DAG.getEntryNode();
 
   // isTailCall may be true since the callee does not reference caller stack
   // frame. Check if it's in the right position and that the return types match.
@@ -2059,6 +2078,8 @@
               (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
   if (isTailCall)
     InChain = TCChain;
+  assert(!(isTailCall && Node->isStrictFPOpcode()) &&
+         "Constrained FP tail calls are untested.");
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned);
@@ -2168,7 +2189,8 @@
                                            RTLIB::Libcall Call_F128,
                                            RTLIB::Libcall Call_PPCF128) {
   RTLIB::Libcall LC;
-  switch (Node->getOperand(0).getValueType().getSimpleVT().SimpleTy) {
+  unsigned OpNum = Node->isStrictFPOpcode() ? 1 : 0;
+  switch (Node->getOperand(OpNum).getValueType().getSimpleVT().SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32: LC = Call_F32; break;
   case MVT::f64: LC = Call_F64; break;
@@ -2911,24 +2933,60 @@
                                       RTLIB::LROUND_F128,
                                       RTLIB::LROUND_PPCF128));
     break;
+  case ISD::STRICT_LROUND:
+    Tmp1 = ExpandArgFPLibCall(Node, RTLIB::LROUND_F32,
+                              RTLIB::LROUND_F64, RTLIB::LROUND_F80,
+                              RTLIB::LROUND_F128,
+                              RTLIB::LROUND_PPCF128);
+    ReplaceNode(Node, Tmp1.getNode());
+    LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_LROUND node\n");
+    return true;
   case ISD::LLROUND:
     Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32,
                                          RTLIB::LLROUND_F64, RTLIB::LLROUND_F80,
                                          RTLIB::LLROUND_F128,
                                          RTLIB::LLROUND_PPCF128));
     break;
+  case ISD::STRICT_LLROUND:
+    Tmp1 = ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32,
+                              RTLIB::LLROUND_F64, RTLIB::LLROUND_F80,
+                              RTLIB::LLROUND_F128,
+                              RTLIB::LLROUND_PPCF128);
+    ReplaceNode(Node, Tmp1.getNode());
+    LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_LLROUND node\n");
+    return true;
   case ISD::LRINT:
     Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32,
                                          RTLIB::LRINT_F64, RTLIB::LRINT_F80,
                                          RTLIB::LRINT_F128,
                                          RTLIB::LRINT_PPCF128));
     break;
+  case ISD::STRICT_LRINT:
+    Tmp1 = ExpandArgFPLibCall(Node, RTLIB::LRINT_F32,
+                              RTLIB::LRINT_F64, RTLIB::LRINT_F80,
+                              RTLIB::LRINT_F128,
+                              RTLIB::LRINT_PPCF128);
+    ReplaceNode(Node, Tmp1.getNode());
+    LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_LRINT node\n");
+    return true;
   case ISD::LLRINT:
     Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32,
                                          RTLIB::LLRINT_F64, RTLIB::LLRINT_F80,
                                          RTLIB::LLRINT_F128,
                                          RTLIB::LLRINT_PPCF128));
     break;
+  case ISD::STRICT_LLRINT:
+    Tmp1 = ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32,
+                              RTLIB::LLRINT_F64, RTLIB::LLRINT_F80,
+                              RTLIB::LLRINT_F128,
+                              RTLIB::LLRINT_PPCF128);
+    ReplaceNode(Node, Tmp1.getNode());
+    LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_LLRINT node\n");
+    return true;
   case ISD::VAARG:
     Results.push_back(DAG.expandVAArg(Node));
     Results.push_back(Results[0].getValue(1));
@@ -4565,6 +4623,7 @@
       continue;
     }
 
+    LLVM_DEBUG(dbgs() << "\nExamining: "; N->dump(this));
     if (LegalizedNodes.insert(N).second) {
       AnyLegalized = true;
       Legalizer.LegalizeOp(N);
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -115,6 +115,11 @@
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:    Res = PromoteIntRes_FP_TO_XINT(N); break;
 
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND: Res = PromoteIntRes_CHAINED(N); break;
+
   case ISD::FP_TO_FP16:    Res = PromoteIntRes_FP_TO_FP16(N); break;
 
   case ISD::FLT_ROUNDS_: Res = PromoteIntRes_FLT_ROUNDS(N); break;
@@ -515,6 +520,23 @@
   return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_CHAINED(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned NewOpc = N->getOpcode();
+  SmallVector<SDValue, 4> Opers;
+  SDLoc dl(N);
+
+  // Copy all operands over to the promoted node, including the incoming
+  // chain operand.
+  for (unsigned i = 0; i < N->getNumOperands(); ++i)
+    Opers.push_back(N->getOperand(i));
+
+  SDValue Result = DAG.getNode(NewOpc, dl, { NVT, MVT::Other }, Opers);
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+  return Result;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_FLT_ROUNDS(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -320,6 +320,7 @@
   SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
+  SDValue PromoteIntRes_CHAINED(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
   SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -333,6 +333,10 @@
   case ISD::STRICT_FFLOOR:
   case ISD::STRICT_FROUND:
   case ISD::STRICT_FTRUNC:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
   case ISD::STRICT_FP_ROUND:
   case ISD::STRICT_FP_EXTEND:
     // These pseudo-ops get legalized as if they were their non-strict
@@ -844,6 +848,10 @@
   case ISD::STRICT_FFLOOR:
   case ISD::STRICT_FROUND:
   case ISD::STRICT_FTRUNC:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
     return ExpandStrictFPOp(Op);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_MUL:
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -171,6 +171,10 @@
   case ISD::STRICT_FFLOOR:
   case ISD::STRICT_FROUND:
   case ISD::STRICT_FTRUNC:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
   case ISD::STRICT_FP_EXTEND:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
@@ -964,6 +968,10 @@
   case ISD::STRICT_FFLOOR:
   case ISD::STRICT_FROUND:
   case ISD::STRICT_FTRUNC:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
   case ISD::UADDO:
@@ -1989,6 +1997,14 @@
   case ISD::ANY_EXTEND:
   case ISD::FTRUNC:
   case ISD::FCANONICALIZE:
+  case ISD::LRINT:
+  case ISD::STRICT_LRINT:
+  case ISD::LLRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::LROUND:
+  case ISD::STRICT_LROUND:
+  case ISD::LLROUND:
+  case ISD::STRICT_LLROUND:
     Res = SplitVecOp_UnaryOp(N);
     break;
@@ -2790,6 +2806,10 @@
   case ISD::STRICT_FP_EXTEND:
   case ISD::STRICT_FP_ROUND:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
     Res = WidenVecRes_Convert_StrictFP(N);
     break;
@@ -4098,6 +4118,10 @@
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::TRUNCATE:
+  case ISD::STRICT_LRINT:
+  case ISD::STRICT_LLRINT:
+  case ISD::STRICT_LROUND:
+  case ISD::STRICT_LLROUND:
     Res = WidenVecOp_Convert(N);
     break;
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7766,12 +7766,16 @@
   case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; break;
   case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; break;
   case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; break;
+  case ISD::STRICT_LRINT: NewOpc = ISD::LRINT; break;
+  case ISD::STRICT_LLRINT: NewOpc = ISD::LLRINT; break;
   case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; break;
   case ISD::STRICT_FNEARBYINT: NewOpc = ISD::FNEARBYINT; break;
   case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break;
   case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break;
   case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; break;
   case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; break;
+  case ISD::STRICT_LROUND: NewOpc = ISD::LROUND; break;
+  case ISD::STRICT_LLROUND: NewOpc = ISD::LLROUND; break;
   case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; break;
   case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; break;
   case ISD::STRICT_FP_ROUND: NewOpc = ISD::FP_ROUND; break;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6075,12 +6075,16 @@
   case Intrinsic::experimental_constrained_log:
   case Intrinsic::experimental_constrained_log10:
  case Intrinsic::experimental_constrained_log2:
+  case Intrinsic::experimental_constrained_lrint:
+  case Intrinsic::experimental_constrained_llrint:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
   case Intrinsic::experimental_constrained_maxnum:
   case Intrinsic::experimental_constrained_minnum:
   case Intrinsic::experimental_constrained_ceil:
   case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_lround:
+  case Intrinsic::experimental_constrained_llround:
   case Intrinsic::experimental_constrained_round:
   case Intrinsic::experimental_constrained_trunc:
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
@@ -6868,6 +6872,12 @@
   case Intrinsic::experimental_constrained_log2:
     Opcode = ISD::STRICT_FLOG2;
     break;
+  case Intrinsic::experimental_constrained_lrint:
+    Opcode = ISD::STRICT_LRINT;
+    break;
+  case Intrinsic::experimental_constrained_llrint:
+    Opcode = ISD::STRICT_LLRINT;
+    break;
   case Intrinsic::experimental_constrained_rint:
     Opcode = ISD::STRICT_FRINT;
     break;
@@ -6886,6 +6896,12 @@
   case Intrinsic::experimental_constrained_floor:
     Opcode = ISD::STRICT_FFLOOR;
     break;
+  case Intrinsic::experimental_constrained_lround:
+    Opcode = ISD::STRICT_LROUND;
+    break;
+  case Intrinsic::experimental_constrained_llround:
+    Opcode = ISD::STRICT_LLROUND;
+    break;
   case Intrinsic::experimental_constrained_round:
     Opcode = ISD::STRICT_FROUND;
     break;
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -331,9 +331,13 @@
   case ISD::FP16_TO_FP:                 return "fp16_to_fp";
   case ISD::FP_TO_FP16:                 return "fp_to_fp16";
   case ISD::LROUND:                     return "lround";
+  case ISD::STRICT_LROUND:              return "strict_lround";
   case ISD::LLROUND:                    return "llround";
+  case ISD::STRICT_LLROUND:             return "strict_llround";
   case ISD::LRINT:                      return "lrint";
+  case ISD::STRICT_LRINT:               return "strict_lrint";
   case ISD::LLRINT:                     return "llrint";
+  case ISD::STRICT_LLRINT:              return "strict_llrint";
 
   // Control flow instructions
   case ISD::BR:      return "br";
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -680,10 +680,14 @@
     setOperationAction(ISD::STRICT_FLOG, VT, Expand);
     setOperationAction(ISD::STRICT_FLOG10, VT, Expand);
     setOperationAction(ISD::STRICT_FLOG2, VT, Expand);
+    setOperationAction(ISD::STRICT_LRINT, VT, Expand);
+    setOperationAction(ISD::STRICT_LLRINT, VT, Expand);
     setOperationAction(ISD::STRICT_FRINT, VT, Expand);
     setOperationAction(ISD::STRICT_FNEARBYINT, VT, Expand);
     setOperationAction(ISD::STRICT_FCEIL, VT, Expand);
     setOperationAction(ISD::STRICT_FFLOOR, VT, Expand);
+    setOperationAction(ISD::STRICT_LROUND, VT, Expand);
+    setOperationAction(ISD::STRICT_LLROUND, VT, Expand);
     setOperationAction(ISD::STRICT_FROUND, VT, Expand);
     setOperationAction(ISD::STRICT_FTRUNC, VT, Expand);
     setOperationAction(ISD::STRICT_FMAXNUM, VT, Expand);
@@ -745,6 +749,20 @@
     setOperationAction(ISD::LLRINT, VT, Expand);
   }
 
+  // These are likely to be library calls, so vectors need to be unrolled.
+  // All vector types need to be marked, since we sometimes check the action
+  // using the floating-point operand type and other times using the integer
+  // result type.
+  for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
+       I <= MVT::LAST_VECTOR_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    setOperationAction(ISD::LROUND, VT, Expand);
+    setOperationAction(ISD::LLROUND, VT, Expand);
+    setOperationAction(ISD::LRINT, VT, Expand);
+    setOperationAction(ISD::LLRINT, VT, Expand);
+  }
+
   // Default ISD::TRAP to expand (which turns it into abort).
   setOperationAction(ISD::TRAP, MVT::Other, Expand);
Index: lib/IR/IntrinsicInst.cpp
===================================================================
--- lib/IR/IntrinsicInst.cpp
+++ lib/IR/IntrinsicInst.cpp
@@ -199,10 +199,14 @@
   case Intrinsic::experimental_constrained_log:
   case Intrinsic::experimental_constrained_log10:
   case Intrinsic::experimental_constrained_log2:
+  case Intrinsic::experimental_constrained_lrint:
+  case Intrinsic::experimental_constrained_llrint:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
   case Intrinsic::experimental_constrained_ceil:
   case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_lround:
+  case Intrinsic::experimental_constrained_llround:
   case Intrinsic::experimental_constrained_round:
   case Intrinsic::experimental_constrained_trunc:
     return true;
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -4247,12 +4247,16 @@
   case Intrinsic::experimental_constrained_log:
   case Intrinsic::experimental_constrained_log10:
   case Intrinsic::experimental_constrained_log2:
+  case Intrinsic::experimental_constrained_lrint:
+  case Intrinsic::experimental_constrained_llrint:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
   case Intrinsic::experimental_constrained_maxnum:
   case Intrinsic::experimental_constrained_minnum:
   case Intrinsic::experimental_constrained_ceil:
   case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_lround:
+  case Intrinsic::experimental_constrained_llround:
   case Intrinsic::experimental_constrained_round:
   case Intrinsic::experimental_constrained_trunc:
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(Call));
@@ -4698,12 +4702,21 @@
   case Intrinsic::experimental_constrained_floor:
   case Intrinsic::experimental_constrained_round:
   case Intrinsic::experimental_constrained_trunc:
+  case Intrinsic::experimental_constrained_lrint:
+  case Intrinsic::experimental_constrained_llrint:
     Assert((NumOperands == 3), "invalid arguments for constrained FP intrinsic",
            &FPI);
     HasExceptionMD = true;
     HasRoundingMD = true;
     break;
+  case Intrinsic::experimental_constrained_lround:
+  case Intrinsic::experimental_constrained_llround:
+    Assert((NumOperands == 2), "invalid arguments for constrained FP intrinsic",
+           &FPI);
+    HasExceptionMD = true;
+    break;
+
   case Intrinsic::experimental_constrained_fma:
     Assert((NumOperands == 5), "invalid arguments for constrained FP intrinsic",
            &FPI);
Index: test/CodeGen/X86/fp-intrinsics.ll
===================================================================
--- test/CodeGen/X86/fp-intrinsics.ll
+++ test/CodeGen/X86/fp-intrinsics.ll
@@ -309,6 +309,82 @@
   ret double %result
 }
 
+; CHECK-LABEL: f23
+; COMMON: callq lrint
+define i32 @f23(double %x) {
+entry:
+  %result = call i32 @llvm.experimental.constrained.lrint.i32.f64(double %x,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret i32 %result
+}
+
+; CHECK-LABEL: f24
+; COMMON: callq lrintf
+define i32 @f24(float %x) {
+entry:
+  %result = call i32 @llvm.experimental.constrained.lrint.i32.f32(float %x,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret i32 %result
+}
+
+; CHECK-LABEL: f25
+; COMMON: callq llrint
+define i64 @f25(double %x) {
+entry:
+  %result = call i64 @llvm.experimental.constrained.llrint.i64.f64(double %x,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret i64 %result
+}
+
+; CHECK-LABEL: f26
+; COMMON: callq llrintf
+define i64 @f26(float %x) {
+entry:
+  %result = call i64 @llvm.experimental.constrained.llrint.i64.f32(float %x,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret i64 %result
+}
+
+; CHECK-LABEL: f27
+; COMMON: callq lround
+define i32 @f27(double %x) {
+entry:
+  %result = call i32 @llvm.experimental.constrained.lround.i32.f64(double %x,
+                                               metadata !"fpexcept.strict")
+  ret i32 %result
+}
+
+; CHECK-LABEL: f28
+; COMMON: callq lroundf
+define i32 @f28(float %x) {
+entry:
+  %result = call i32 @llvm.experimental.constrained.lround.i32.f32(float %x,
+                                               metadata !"fpexcept.strict")
+  ret i32 %result
+}
+
+; CHECK-LABEL: f29
+; COMMON: callq llround
+define i64 @f29(double %x) {
+entry:
+  %result = call i64 @llvm.experimental.constrained.llround.i64.f64(double %x,
+                                               metadata !"fpexcept.strict")
+  ret i64 %result
+}
+
+; CHECK-LABEL: f30
+; COMMON: callq llroundf
+define i64 @f30(float %x) {
+entry:
+  %result = call i64 @llvm.experimental.constrained.llround.i64.f32(float %x,
+                                               metadata !"fpexcept.strict")
+  ret i64 %result
+}
+
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)
@@ -331,4 +407,12 @@
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f64(double, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f32(float, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f64(double, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f32(float, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f64(double, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata)
+declare i64
@llvm.experimental.constrained.llround.i64.f32(float, metadata) Index: test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -4603,7 +4603,1515 @@ ret <3 x double> %trunc } +define <1 x i32> @constrained_vector_lrint_v1f32() { +; CHECK-LABEL: constrained_vector_lrint_v1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v1f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: popq %rcx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <1 x i32> @llvm.experimental.constrained.lrint.v1i32.v1f32( + <1 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <1 x i32> %result +} +define <2 x i32> @constrained_vector_lrint_v2f32() { +; CHECK-LABEL: constrained_vector_lrint_v2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v2f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <2 x i32> @llvm.experimental.constrained.lrint.v2i32.v2f32( + <2 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x i32> %result +} + +define <3 x i32> @constrained_vector_lrint_v3f32() { +; CHECK-LABEL: constrained_vector_lrint_v3f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v3f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: .cfi_offset %rbx, -16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <3 x i32> @llvm.experimental.constrained.lrint.v3i32.v3f32( + <3 x float>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x i32> %result +} + +define <4 x i32> @constrained_vector_lrint_v4f32() { +; CHECK-LABEL: constrained_vector_lrint_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lrintf +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v4f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: .cfi_offset %rbx, -16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lrintf +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; 
AVX-NEXT: retq +entry: + %result = call <4 x i32> @llvm.experimental.constrained.lrint.v4i32.v4f32( + <4 x float>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x i32> %result +} + +define <1 x i64> @constrained_vector_llrint_v1f32() { +; CHECK-LABEL: constrained_vector_llrint_v1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v1f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: popq %rcx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <1 x i64> @llvm.experimental.constrained.llrint.v1i64.v1f32( + <1 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <1 x i64> %result +} + +define <2 x i64> @constrained_vector_llrint_v2f32() { +; CHECK-LABEL: constrained_vector_llrint_v2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v2f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <2 x i64> @llvm.experimental.constrained.llrint.v2i32.v2f32( + <2 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x i64> %result +} + +define <3 x i64> @constrained_vector_llrint_v3f32() { +; CHECK-LABEL: constrained_vector_llrint_v3f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rbx, %rdx +; CHECK-NEXT: movq %r14, %rcx +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; 
AVX-LABEL: constrained_vector_llrint_v3f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $56, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <3 x i64> @llvm.experimental.constrained.llrint.v3i64.v3f32( + <3 x float>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x i64> %result +} + +define <4 x i64> @constrained_vector_llrint_v4f32() { +; CHECK-LABEL: constrained_vector_llrint_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq llrintf +; CHECK-NEXT: movq %rax, %xmm1 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v4f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq llrintf +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <4 x i64> @llvm.experimental.constrained.llrint.v4i64.v4f32( + 
<4 x float>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x i64> %result +} + +define <1 x i32> @constrained_vector_lrint_v1f64() { +; CHECK-LABEL: constrained_vector_lrint_v1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v1f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: popq %rcx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <1 x i32> @llvm.experimental.constrained.lrint.v1i32.v1f64( + <1 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <1 x i32> %result +} + +define <2 x i32> @constrained_vector_lrint_v2f64() { +; CHECK-LABEL: constrained_vector_lrint_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v2f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <2 x i32> @llvm.experimental.constrained.lrint.v2i32.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x i32> %result +} + +define <3 x i32> @constrained_vector_lrint_v3f64() { +; CHECK-LABEL: constrained_vector_lrint_v3f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v3f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: .cfi_offset %rbx, -16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpinsrd $2, %eax, 
%xmm0, %xmm0 +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <3 x i32> @llvm.experimental.constrained.lrint.v3i32.v3f64( + <3 x double>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x i32> %result +} + +define <4 x i32> @constrained_vector_lrint_v4f64() { +; CHECK-LABEL: constrained_vector_lrint_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq lrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: shufps $136, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0,2],mem[0,2] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lrint_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: .cfi_offset %rbx, -16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq lrint +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <4 x i32> @llvm.experimental.constrained.lrint.v4i32.v4f64( + <4 x double>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x i32> %result +} + +define <1 x i64> @constrained_vector_llrint_v1f64() { +; CHECK-LABEL: constrained_vector_llrint_v1f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v1f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: popq %rcx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <1 x i64> @llvm.experimental.constrained.llrint.v1i64.v1f64( + <1 x 
double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <1 x i64> %result +} + +define <2 x i64> @constrained_vector_llrint_v2f64() { +; CHECK-LABEL: constrained_vector_llrint_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v2f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <2 x i64> @llvm.experimental.constrained.llrint.v2i32.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x i64> %result +} + +define <3 x i64> @constrained_vector_llrint_v3f64() { +; CHECK-LABEL: constrained_vector_llrint_v3f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rbx, %rdx +; CHECK-NEXT: movq %r14, %rcx +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v3f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $56, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <3 x i64> @llvm.experimental.constrained.llrint.v3i64.v3f64( + <3 x double>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x i64> %result +} + +define <4 x i64> @constrained_vector_llrint_v4f64() { +; CHECK-LABEL: constrained_vector_llrint_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq 
llrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: callq llrint +; CHECK-NEXT: movq %rax, %xmm1 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_llrint_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq llrint +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <4 x i64> @llvm.experimental.constrained.llrint.v4i64.v4f64( + <4 x double>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x i64> %result +} + +define <1 x i32> @constrained_vector_lround_v1f32() { +; CHECK-LABEL: constrained_vector_lround_v1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lroundf +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_lround_v1f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: callq lroundf +; AVX-NEXT: popq %rcx +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call <1 x i32> @llvm.experimental.constrained.lround.v1i32.v1f32( + <1 x float> , + metadata !"fpexcept.strict") + ret <1 x i32> %result +} + +define <2 x i32> @constrained_vector_lround_v2f32() { +; CHECK-LABEL: constrained_vector_lround_v2f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lroundf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq lroundf +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: 
+;
+; AVX-LABEL: constrained_vector_lround_v2f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $24, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $24, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <2 x i32> @llvm.experimental.constrained.lround.v2i32.v2f32(
+ <2 x float> ,
+ metadata !"fpexcept.strict")
+ ret <2 x i32> %result
+}
+
+define <3 x i32> @constrained_vector_lround_v3f32() {
+; CHECK-LABEL: constrained_vector_lround_v3f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_lround_v3f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: .cfi_offset %rbx, -16
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: movl %eax, %ebx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <3 x i32> @llvm.experimental.constrained.lround.v3i32.v3f32(
+ <3 x float>,
+ metadata !"fpexcept.strict")
+ ret <3 x i32> %result
+}
+
+define <4 x i32> @constrained_vector_lround_v4f32() {
+; CHECK-LABEL: constrained_vector_lround_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq lroundf
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_lround_v4f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: .cfi_offset %rbx, -16
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: movl %eax, %ebx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq lroundf
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <4 x i32> @llvm.experimental.constrained.lround.v4i32.v4f32(
+ <4 x float>,
+ metadata !"fpexcept.strict")
+ ret <4 x i32> %result
+}
+
+define <1 x i64> @constrained_vector_llround_v1f32() {
+; CHECK-LABEL: constrained_vector_llround_v1f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v1f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: popq %rcx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <1 x i64> @llvm.experimental.constrained.llround.v1i64.v1f32(
+ <1 x float> ,
+ metadata !"fpexcept.strict")
+ ret <1 x i64> %result
+}
+
+define <2 x i64> @constrained_vector_llround_v2f32() {
+; CHECK-LABEL: constrained_vector_llround_v2f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v2f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $24, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $24, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <2 x i64> @llvm.experimental.constrained.llround.v2i64.v2f32(
+ <2 x float> ,
+ metadata !"fpexcept.strict")
+ ret <2 x i64> %result
+}
+
+define <3 x i64> @constrained_vector_llround_v3f32() {
+; CHECK-LABEL: constrained_vector_llround_v3f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rbx, %rdx
+; CHECK-NEXT: movq %r14, %rcx
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v3f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $56, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 64
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: addq $56, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <3 x i64> @llvm.experimental.constrained.llround.v3i64.v3f32(
+ <3 x float>,
+ metadata !"fpexcept.strict")
+ ret <3 x i64> %result
+}
+
+define <4 x i64> @constrained_vector_llround_v4f32() {
+; CHECK-LABEL: constrained_vector_llround_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq llroundf
+; CHECK-NEXT: movq %rax, %xmm1
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v4f32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: callq llroundf
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <4 x i64> @llvm.experimental.constrained.llround.v4i64.v4f32(
+ <4 x float>,
+ metadata !"fpexcept.strict")
+ ret <4 x i64> %result
+}
+
+
+define <1 x i32> @constrained_vector_lround_v1f64() {
+; CHECK-LABEL: constrained_vector_lround_v1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_lround_v1f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: popq %rcx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <1 x i32> @llvm.experimental.constrained.lround.v1i32.v1f64(
+ <1 x double> ,
+ metadata !"fpexcept.strict")
+ ret <1 x i32> %result
+}
+
+define <2 x i32> @constrained_vector_lround_v2f64() {
+; CHECK-LABEL: constrained_vector_lround_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_lround_v2f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: popq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <2 x i32> @llvm.experimental.constrained.lround.v2i32.v2f64(
+ <2 x double> ,
+ metadata !"fpexcept.strict")
+ ret <2 x i32> %result
+}
+
+define <3 x i32> @constrained_vector_lround_v3f64() {
+; CHECK-LABEL: constrained_vector_lround_v3f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_lround_v3f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: .cfi_offset %rbx, -16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: movl %eax, %ebx
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <3 x i32> @llvm.experimental.constrained.lround.v3i32.v3f64(
+ <3 x double>,
+ metadata !"fpexcept.strict")
+ ret <3 x i32> %result
+}
+
+define <4 x i32> @constrained_vector_lround_v4f64() {
+; CHECK-LABEL: constrained_vector_lround_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq lround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: shufps $136, (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0,2],mem[0,2]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_lround_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 32
+; AVX-NEXT: .cfi_offset %rbx, -16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: movl %eax, %ebx
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq lround
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <4 x i32> @llvm.experimental.constrained.lround.v4i32.v4f64(
+ <4 x double>,
+ metadata !"fpexcept.strict")
+ ret <4 x i32> %result
+}
+
+define <1 x i64> @constrained_vector_llround_v1f64() {
+; CHECK-LABEL: constrained_vector_llround_v1f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v1f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: popq %rcx
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <1 x i64> @llvm.experimental.constrained.llround.v1i64.v1f64(
+ <1 x double> ,
+ metadata !"fpexcept.strict")
+ ret <1 x i64> %result
+}
+
+define <2 x i64> @constrained_vector_llround_v2f64() {
+; CHECK-LABEL: constrained_vector_llround_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v2f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: popq %rax
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <2 x i64> @llvm.experimental.constrained.llround.v2i64.v2f64(
+ <2 x double> ,
+ metadata !"fpexcept.strict")
+ ret <2 x i64> %result
+}
+
+define <3 x i64> @constrained_vector_llround_v3f64() {
+; CHECK-LABEL: constrained_vector_llround_v3f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rbx, %rdx
+; CHECK-NEXT: movq %r14, %rcx
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v3f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $56, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 64
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: addq $56, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <3 x i64> @llvm.experimental.constrained.llround.v3i64.v3f64(
+ <3 x double>,
+ metadata !"fpexcept.strict")
+ ret <3 x i64> %result
+}
+
+define <4 x i64> @constrained_vector_llround_v4f64() {
+; CHECK-LABEL: constrained_vector_llround_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: callq llround
+; CHECK-NEXT: movq %rax, %xmm1
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+;
+; AVX-LABEL: constrained_vector_llround_v4f64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: callq llround
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %result = call <4 x i64> @llvm.experimental.constrained.llround.v4i64.v4f64(
+ <4 x double>,
+ metadata !"fpexcept.strict")
+ ret <4 x i64> %result
+}
+
 ; Single width declarations
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
@@ -4630,6 +6138,14 @@
 declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata)
+declare <2 x i32> @llvm.experimental.constrained.lrint.v2i32.v2f32(<2 x float>, metadata, metadata)
+declare <2 x i64> @llvm.experimental.constrained.llrint.v2i64.v2f32(<2 x float>, metadata, metadata)
+declare <2 x i32> @llvm.experimental.constrained.lrint.v2i32.v2f64(<2 x double>, metadata, metadata)
+declare <2 x i64> @llvm.experimental.constrained.llrint.v2i64.v2f64(<2 x double>, metadata, metadata)
+declare <2 x i32> @llvm.experimental.constrained.lround.v2i32.v2f32(<2 x float>, metadata)
+declare <2 x i64> @llvm.experimental.constrained.llround.v2i64.v2f32(<2 x float>, metadata)
+declare <2 x i32> @llvm.experimental.constrained.lround.v2i32.v2f64(<2 x double>, metadata)
+declare <2 x i64> @llvm.experimental.constrained.llround.v2i64.v2f64(<2 x double>, metadata)

 ; Scalar width declarations
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
@@ -4657,6 +6173,14 @@
 declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata)
+declare <1 x i32> @llvm.experimental.constrained.lrint.v1i32.v1f32(<1 x float>, metadata, metadata)
+declare <1 x i64> @llvm.experimental.constrained.llrint.v1i64.v1f32(<1 x float>, metadata, metadata)
+declare <1 x i32> @llvm.experimental.constrained.lround.v1i32.v1f32(<1 x float>, metadata)
+declare <1 x i64> @llvm.experimental.constrained.llround.v1i64.v1f32(<1 x float>, metadata)
+declare <1 x i32> @llvm.experimental.constrained.lrint.v1i32.v1f64(<1 x double>, metadata, metadata)
+declare <1 x i64> @llvm.experimental.constrained.llrint.v1i64.v1f64(<1 x double>, metadata, metadata)
+declare <1 x i32> @llvm.experimental.constrained.lround.v1i32.v1f64(<1 x double>, metadata)
+declare <1 x i64> @llvm.experimental.constrained.llround.v1i64.v1f64(<1 x double>, metadata)

 ; Illegal width declarations
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
@@ -4707,6 +6231,14 @@
 declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata)
+declare <3 x i32> @llvm.experimental.constrained.lrint.v3i32.v3f32(<3 x float>, metadata, metadata)
+declare <3 x i64> @llvm.experimental.constrained.llrint.v3i64.v3f32(<3 x float>, metadata, metadata)
+declare <3 x i32> @llvm.experimental.constrained.lrint.v3i32.v3f64(<3 x double>, metadata, metadata)
+declare <3 x i64> @llvm.experimental.constrained.llrint.v3i64.v3f64(<3 x double>, metadata, metadata)
+declare <3 x i32> @llvm.experimental.constrained.lround.v3i32.v3f32(<3 x float>, metadata)
+declare <3 x i64> @llvm.experimental.constrained.llround.v3i64.v3f32(<3 x float>, metadata)
+declare <3 x i32> @llvm.experimental.constrained.lround.v3i32.v3f64(<3 x double>, metadata)
+declare <3 x i64> @llvm.experimental.constrained.llround.v3i64.v3f64(<3 x double>, metadata)

 ; Double width declarations
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
@@ -4734,3 +6266,11 @@
 declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata)
+declare <4 x i32> @llvm.experimental.constrained.lrint.v4i32.v4f32(<4 x float>, metadata, metadata)
+declare <4 x i64> @llvm.experimental.constrained.llrint.v4i64.v4f32(<4 x float>, metadata, metadata)
+declare <4 x i32> @llvm.experimental.constrained.lrint.v4i32.v4f64(<4 x double>, metadata, metadata)
+declare <4 x i64> @llvm.experimental.constrained.llrint.v4i64.v4f64(<4 x double>, metadata, metadata)
+declare <4 x i32> @llvm.experimental.constrained.lround.v4i32.v4f32(<4 x float>, metadata)
+declare <4 x i64> @llvm.experimental.constrained.llround.v4i64.v4f32(<4 x float>, metadata)
+declare <4 x i32> @llvm.experimental.constrained.lround.v4i32.v4f64(<4 x double>, metadata)
+declare <4 x i64> @llvm.experimental.constrained.llround.v4i64.v4f64(<4 x double>, metadata)
Index: test/Feature/fp-intrinsics.ll
===================================================================
--- test/Feature/fp-intrinsics.ll
+++ test/Feature/fp-intrinsics.ll
@@ -266,6 +266,90 @@
 ret double %result
 }

+; Verify that lrint(42.1) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f22
+; CHECK: call i32 @llvm.experimental.constrained.lrint
+define i32 @f22() {
+entry:
+ %result = call i32 @llvm.experimental.constrained.lrint.i32.f64(double 42.1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ ret i32 %result
+}
+
+; Verify that lrintf(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f23
+; CHECK: call i32 @llvm.experimental.constrained.lrint
+define i32 @f23() {
+entry:
+ %result = call i32 @llvm.experimental.constrained.lrint.i32.f32(float 42.0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ ret i32 %result
+}
+
+; Verify that llrint(42.1) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f24
+; CHECK: call i64 @llvm.experimental.constrained.llrint
+define i64 @f24() {
+entry:
+ %result = call i64 @llvm.experimental.constrained.llrint.i64.f64(double 42.1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ ret i64 %result
+}
+
+; Verify that llrintf(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f25
+; CHECK: call i64 @llvm.experimental.constrained.llrint
+define i64 @f25() {
+entry:
+ %result = call i64 @llvm.experimental.constrained.llrint.i64.f32(float 42.0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ ret i64 %result
+}
+
+; Verify that lround(42.1) isn't simplified when the exception behavior is strict.
+; CHECK-LABEL: f26
+; CHECK: call i32 @llvm.experimental.constrained.lround
+define i32 @f26() {
+entry:
+ %result = call i32 @llvm.experimental.constrained.lround.i32.f64(double 42.1,
+ metadata !"fpexcept.strict")
+ ret i32 %result
+}
+
+; Verify that lroundf(42.0) isn't simplified when the exception behavior is strict.
+; CHECK-LABEL: f27
+; CHECK: call i32 @llvm.experimental.constrained.lround
+define i32 @f27() {
+entry:
+ %result = call i32 @llvm.experimental.constrained.lround.i32.f32(float 42.0,
+ metadata !"fpexcept.strict")
+ ret i32 %result
+}
+
+; Verify that llround(42.1) isn't simplified when the exception behavior is strict.
+; CHECK-LABEL: f28
+; CHECK: call i64 @llvm.experimental.constrained.llround
+define i64 @f28() {
+entry:
+ %result = call i64 @llvm.experimental.constrained.llround.i64.f64(double 42.1,
+ metadata !"fpexcept.strict")
+ ret i64 %result
+}
+
+; Verify that llroundf(42.0) isn't simplified when the exception behavior is strict.
+; CHECK-LABEL: f29
+; CHECK: call i64 @llvm.experimental.constrained.llround
+define i64 @f29() {
+entry:
+ %result = call i64 @llvm.experimental.constrained.llround.i64.f32(float 42.0,
+ metadata !"fpexcept.strict")
+ ret i64 %result
+}
+
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
@@ -286,3 +370,11 @@
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f64(double, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f32(float, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f64(double, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f32(float, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f64(double, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata)
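As a usage sketch (not part of the patch itself): the IR below shows a fully spelled-out call site for one of the new scalar intrinsics. The function name @llround_example and the input constant 42.5 are hypothetical placeholders chosen for illustration; only the intrinsic signature is taken from the declarations above.

declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata)

define i64 @llround_example() {
entry:
  ; 42.5 is a hypothetical operand. Because the exception behavior is
  ; "fpexcept.strict", the optimizer must assume the call may raise a
  ; floating-point exception, so the constant operand may not be folded away.
  %r = call i64 @llvm.experimental.constrained.llround.i64.f64(
                  double 42.5,
                  metadata !"fpexcept.strict")
  ret i64 %r
}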