Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -15552,6 +15552,78 @@ The result produced is a signed integer converted from the floating point operand. The value is truncated, so it is rounded towards zero. +'``llvm.experimental.constrained.uitofp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.uitofp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.uitofp``' intrinsic converts an +unsigned integer ``value`` to a floating-point of type ``ty2``. + +Arguments: +"""""""""" + +The first argument to the '``llvm.experimental.constrained.uitofp``' +intrinsic must be an :ref:`integer ` or :ref:`vector +` of integer values. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +An inexact floating-point exception will be raised if rounding is required. +Any result produced is a floating point value converted from the input +integer operand. + +'``llvm.experimental.constrained.sitofp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sitofp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sitofp``' intrinsic converts a +signed integer ``value`` to a floating-point of type ``ty2``. + +Arguments: +"""""""""" + +The first argument to the '``llvm.experimental.constrained.sitofp``' +intrinsic must be an :ref:`integer ` or :ref:`vector +` of integer values. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +An inexact floating-point exception will be raised if rounding is required. +Any result produced is a floating point value converted from the input +integer operand. + '``llvm.experimental.constrained.fptrunc``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -310,6 +310,13 @@ STRICT_FP_TO_SINT, STRICT_FP_TO_UINT, + /// STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to + /// a floating point value. These have the same semantics as sitofp and + /// uitofp in IR. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_SINT_TO_FP, + STRICT_UINT_TO_FP, + /// X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating /// point type down to the precision of the destination VT. TRUNC is a /// flag, which is always an integer that is zero or one. If TRUNC is 0, Index: llvm/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAG.h +++ llvm/include/llvm/CodeGen/SelectionDAG.h @@ -811,6 +811,11 @@ /// float type VT, by either extending or rounding (by truncation). SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT); + /// Convert Op, which must be a STRICT operation of float type, to the + /// float type VT, by either extending or rounding (by truncation). + std::pair + getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT); + /// Convert Op, which must be of integer type, to the /// integer type VT, by either any-extending or truncating it. SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT); Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -4123,14 +4123,18 @@ /// Expand float to UINT conversion /// \param N Node to expand /// \param Result output after conversion + /// \param Chain output chain after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const; + bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand UINT(i64) to double(f64) conversion /// \param N Node to expand /// \param Result output after conversion + /// \param Chain output chain after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; Index: llvm/include/llvm/IR/ConstrainedOps.def =================================================================== --- llvm/include/llvm/IR/ConstrainedOps.def +++ llvm/include/llvm/IR/ConstrainedOps.def @@ -36,6 +36,8 @@ INSTRUCTION(FDiv, 2, 1, experimental_constrained_fdiv, FDIV) INSTRUCTION(FRem, 2, 1, experimental_constrained_frem, FREM) INSTRUCTION(FPExt, 1, 0, experimental_constrained_fpext, FP_EXTEND) +INSTRUCTION(SIToFP, 1, 1, experimental_constrained_sitofp, SINT_TO_FP) +INSTRUCTION(UIToFP, 1, 1, experimental_constrained_uitofp, UINT_TO_FP) INSTRUCTION(FPToSI, 1, 0, experimental_constrained_fptosi, FP_TO_SINT) INSTRUCTION(FPToUI, 1, 0, experimental_constrained_fptoui, FP_TO_UINT) INSTRUCTION(FPTrunc, 1, 1, experimental_constrained_fptrunc, FP_ROUND) Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -640,6 +640,16 @@ [ llvm_anyfloat_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_sitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + + def int_experimental_constrained_uitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptrunc : Intrinsic<[ llvm_anyfloat_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty, Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -172,8 +172,7 @@ SDValue NewIntValue) const; SDValue ExpandFCOPYSIGN(SDNode *Node) const; SDValue ExpandFABS(SDNode *Node) const; - SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, - const SDLoc &dl); + SDValue ExpandLegalINT_TO_FP(SDNode *Node, SDValue &Chain); SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl); void PromoteLegalFP_TO_INT(SDNode *N, const SDLoc &dl, @@ -1016,6 +1015,8 @@ Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_LRINT: case ISD::STRICT_LLRINT: case ISD::STRICT_LROUND: @@ -2337,9 +2338,14 @@ /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are /// legal for the target. -SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, - EVT DestVT, - const SDLoc &dl) { +SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node, + SDValue &Chain) { + bool isSigned = (Node->getOpcode() == ISD::STRICT_SINT_TO_FP || + Node->getOpcode() == ISD::SINT_TO_FP); + EVT DestVT = Node->getValueType(0); + SDLoc dl(Node); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Op0 = Node->getOperand(OpNo); EVT SrcVT = Op0.getValueType(); // TODO: Should any fast-math-flags be set for the created nodes? @@ -2387,15 +2393,39 @@ BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // subtract the bias - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); + SDValue Sub; + if (Node->isStrictFPOpcode()) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Node->getOperand(0), Load, Bias}); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); // final result - SDValue Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); + SDValue Result; + if (Node->isStrictFPOpcode()) { + if (!DestVT.bitsEq(Sub.getValueType())) { + std::pair ResultPair; + ResultPair = + DAG.getStrictFPExtendOrRound(Sub, SDValue(Node, 1), dl, DestVT); + Result = ResultPair.first; + Chain = ResultPair.second; + } + else + Result = Sub; + } else + Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); return Result; } assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); // Code below here assumes !isSigned without checking again. + // FIXME: This can produce slightly incorrect results. See details in + // FIXME: https://reviews.llvm.org/D69275 - SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); + SDValue Tmp1; + if (Node->isStrictFPOpcode()) { + Tmp1 = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other }, + { Node->getOperand(0), Op0 }); + } else + Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); @@ -2441,6 +2471,13 @@ FudgeInReg = Handle.getValue(); } + if (Node->isStrictFPOpcode()) { + SDValue Result = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other }, + { Tmp1.getValue(1), Tmp1, FudgeInReg }); + Chain = Result.getValue(1); + return Result; + } + return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg); } @@ -2898,15 +2935,20 @@ break; } case ISD::UINT_TO_FP: - if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) { + case ISD::STRICT_UINT_TO_FP: + if (TLI.expandUINT_TO_FP(Node, Tmp1, Tmp2, DAG)) { Results.push_back(Tmp1); + if (Node->isStrictFPOpcode()) + Results.push_back(Tmp2); break; } LLVM_FALLTHROUGH; case ISD::SINT_TO_FP: - Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP, - Node->getOperand(0), Node->getValueType(0), dl); + case ISD::STRICT_SINT_TO_FP: + Tmp1 = ExpandLegalINT_TO_FP(Node, Tmp2); Results.push_back(Tmp1); + if (Node->isStrictFPOpcode()) + Results.push_back(Tmp2); break; case ISD::FP_TO_SINT: if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -307,23 +307,27 @@ return TranslateLegalizeResults(Op, Result); TargetLowering::LegalizeAction Action = TargetLowering::Legal; + EVT ValVT; switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Result); #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" - Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); + ValVT = Node->getValueType(0); + if (Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP) + ValVT = Node->getOperand(1).getValueType(); + Action = TLI.getOperationAction(Node->getOpcode(), ValVT); // If we're asked to expand a strict vector floating-point operation, // by default we're going to simply unroll it. That is usually the // best approach, except in the case where the resulting strict (scalar) // operations would themselves use the fallback mutation to non-strict. // In that specific case, just do the fallback on the vector op. if (Action == TargetLowering::Expand && !TLI.isStrictFPEnabled() && - TLI.getStrictFPOperationAction(Node->getOpcode(), - Node->getValueType(0)) - == TargetLowering::Legal) { - EVT EltVT = Node->getValueType(0).getVectorElementType(); + TLI.getStrictFPOperationAction(Node->getOpcode(), ValVT) == + TargetLowering::Legal) { + EVT EltVT = ValVT.getVectorElementType(); if (TLI.getOperationAction(Node->getOpcode(), EltVT) == TargetLowering::Expand && TLI.getStrictFPOperationAction(Node->getOpcode(), EltVT) @@ -1153,17 +1157,27 @@ } SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { - EVT VT = Op.getOperand(0).getValueType(); + bool IsStrict = Op.getNode()->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + EVT VT = Op.getOperand(OpNo).getValueType(); SDLoc DL(Op); // Attempt to expand using TargetLowering. SDValue Result; - if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG)) + SDValue Chain; + if (TLI.expandUINT_TO_FP(Op.getNode(), Result, Chain, DAG)) { + if (IsStrict) + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Chain); return Result; + } // Make sure that the SINT_TO_FP and SRL instructions are available. - if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand || - TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) + if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, VT) == + TargetLowering::Expand) || + (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, VT) == + TargetLowering::Expand)) || + TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) return DAG.UnrollVectorOp(Op.getNode()); unsigned BW = VT.getScalarSizeInBits(); @@ -1185,6 +1199,29 @@ SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), HalfWordMask); + if (IsStrict) { + // Convert hi and lo to floats + // Convert the hi part back to the upper values + // TODO: Can any fast-math-flags be set on these nodes? + SDValue fHI = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {Op.getOperand(0), HI}); + fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), fHI, TWOHW}); + SDValue fLO = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), LO}); + + // Add the two halves + SDValue Result = + DAG.getNode(ISD::STRICT_FADD, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fLO.getNode(), 1), fHI, fLO}); + + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), SDValue(Result.getNode(), 1)); + return Result; + } + // Convert hi and lo to floats // Convert the hi part back to the upper values // TODO: Can any fast-math-flags be set on these nodes? Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -587,6 +587,8 @@ case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: Res = ScalarizeVecOp_UnaryOp_StrictFP(N); @@ -1261,6 +1263,8 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: SplitVecRes_UnaryOp(N, Lo, Hi); return; default: @@ -1977,9 +1981,12 @@ case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) + if (N->getValueType(0).bitsLT( + N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType())) Res = SplitVecOp_TruncateHelper(N); else Res = SplitVecOp_UnaryOp(N); @@ -2540,7 +2547,8 @@ // // Without this transform, the original truncate would end up being // scalarized, which is pretty much always a last resort. - SDValue InVec = N->getOperand(0); + unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0; + SDValue InVec = N->getOperand(OpNo); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); unsigned NumElements = OutVT.getVectorNumElements(); @@ -2584,8 +2592,23 @@ EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements/2); - SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); - SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + + SDValue HalfLo; + SDValue HalfHi; + SDValue Chain; + if (N->isStrictFPOpcode()) { + HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfLo}); + HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfHi}); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1), + HalfHi.getValue(1)); + } else { + HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); + HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + } // Concatenate them to get the full intermediate truncation result. EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, @@ -2594,6 +2617,17 @@ // type. This should normally be something that ends up being legal directly, // but in theory if a target has very wide vectors and an annoyingly // restricted set of legal types, this split can chain to build things up. + + if (N->isStrictFPOpcode()) { + SDValue Res = DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {OutVT, MVT::Other}, + {Chain, InterVec, + DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))}); + // Relink the chain + ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1)); + return Res; + } + return IsFloat ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec, DAG.getTargetConstant( @@ -3046,6 +3080,8 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: return WidenVecRes_Convert_StrictFP(N); default: break; @@ -4128,7 +4164,9 @@ case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1113,6 +1113,20 @@ : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); } +std::pair +SelectionDAG::getStrictFPExtendOrRound(SDValue Op, SDValue Chain, + const SDLoc &DL, EVT VT) { + assert(!VT.bitsEq(Op.getValueType()) && + "Strict no-op FP extend/round not allowed."); + SDValue Res = + VT.bitsGT(Op.getValueType()) + ? getNode(ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, {Chain, Op}) + : getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, Op, getIntPtrConstant(0, DL)}); + + return std::pair(Res, SDValue(Res.getNode(), 1)); +} + SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ANY_EXTEND, DL, VT, Op) : @@ -9129,16 +9143,25 @@ } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { - assert(N->getNumValues() == 1 && + assert((N->getNumValues() == 1 || + (N->getNumValues() == 2 && N->getValueType(1) == MVT::Other)) && "Can't unroll a vector with multiple results!"); + SDValue Chain; + bool IsChained = (N->getNumValues() == 2); EVT VT = N->getValueType(0); unsigned NE = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); SDLoc dl(N); SmallVector Scalars; + SmallVector Chains; SmallVector Operands(N->getNumOperands()); + SmallVector EltVTs = {EltVT}; + if (IsChained) { + EltVTs.push_back(MVT::Other); + Chain = N->getOperand(0); + } // If ResNE is 0, fully unroll the vector op. if (ResNE == 0) @@ -9148,7 +9171,10 @@ unsigned i; for (i= 0; i != NE; ++i) { - for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { + if (IsChained) + Operands[0] = Chain; + for (unsigned j = (IsChained ? 1 : 0), e = N->getNumOperands(); j != e; + ++j) { SDValue Operand = N->getOperand(j); EVT OperandVT = Operand.getValueType(); if (OperandVT.isVector()) { @@ -9165,8 +9191,13 @@ switch (N->getOpcode()) { default: { - Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, - N->getFlags())); + // Use EltVTs here since the node may be chained. Singular EltVT + // is used for cases where we know there is no chain. + SDValue Scalar = getNode(N->getOpcode(), dl, EltVTs, Operands); + Scalar.getNode()->setFlags(N->getFlags()); + Scalars.push_back(Scalar); + if (IsChained) + Chains.push_back(Scalar.getValue(1)); break; } case ISD::VSELECT: @@ -9190,6 +9221,12 @@ } } + if (IsChained) { + // Build a new factor node to connect the chain back together. + SDValue OutChain = getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + ReplaceAllUsesOfValueWith(SDValue(N, 1), OutChain); + } + for (; i < ResNE; ++i) Scalars.push_back(getUNDEF(EltVT)); Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -324,7 +324,9 @@ case ISD::STRICT_FP_EXTEND: return "strict_fp_extend"; case ISD::SINT_TO_FP: return "sint_to_fp"; + case ISD::STRICT_SINT_TO_FP: return "strict_sint_to_fp"; case ISD::UINT_TO_FP: return "uint_to_fp"; + case ISD::STRICT_UINT_TO_FP: return "strict_uint_to_fp"; case ISD::FP_TO_SINT: return "fp_to_sint"; case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint"; case ISD::FP_TO_UINT: return "fp_to_uint"; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6104,8 +6104,10 @@ } bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, + SDValue &Chain, SelectionDAG &DAG) const { - SDValue Src = Node->getOperand(0); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -6128,7 +6130,13 @@ // For unsigned conversions, convert them to signed conversions using the // algorithm from the x86_64 __floatundidf in compiler_rt. - SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); + SDValue Fast; + if (Node->isStrictFPOpcode()) { + Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), Src}); + Chain = SDValue(Fast.getNode(), 1); + } else + Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT); SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst); @@ -6136,8 +6144,17 @@ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst); SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr); - SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); - SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + SDValue Slow; + if (Node->isStrictFPOpcode()) { + SDValue SignCvt = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, + {DstVT, MVT::Other}, {Chain, Or}); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DstVT, MVT::Other }, + { SignCvt.getValue(1), SignCvt, SignCvt }); + Chain = Slow.getValue(1); + } else { + SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); + Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + } // TODO: This really should be implemented using a branch rather than a // select. We happen to get lucky and machinesink does the right @@ -6180,8 +6197,18 @@ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); - Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + if (Node->isStrictFPOpcode()) { + SDValue HiSub = + DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); + Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, + {HiSub.getValue(1), LoFlt, HiSub}); + Chain = Result.getValue(1); + } else { + SDValue HiSub = + DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + } return true; } Index: llvm/lib/IR/Verifier.cpp =================================================================== --- llvm/lib/IR/Verifier.cpp +++ llvm/lib/IR/Verifier.cpp @@ -4785,6 +4785,28 @@ } break; + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: { + Value *Operand = FPI.getArgOperand(0); + uint64_t NumSrcElem = 0; + Assert(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic first argument must be integer", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + NumSrcElem = OperandT->getNumElements(); + } + + Operand = &FPI; + Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Assert(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic result must be a floating point", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + Assert(NumSrcElem == OperandT->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); + } + } break; + case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: { Value *Operand = FPI.getArgOperand(0); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -233,9 +233,11 @@ // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. @@ -245,9 +247,11 @@ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. @@ -981,9 +985,12 @@ setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); @@ -18398,8 +18405,12 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); - SDValue Src = Op.getOperand(0); + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); @@ -18416,7 +18427,17 @@ SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); + if (Op.getNode()->isStrictFPOpcode()) { + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, + {Op.getOperand(0), InVec}); + SDValue Chain = CvtVec.getValue(1); + SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); + return DAG.getMergeValues({Value, Chain}, dl); + } + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } @@ -18487,7 +18508,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -18496,7 +18518,9 @@ return Extract; if (SrcVT.isVector()) { - if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64 && + !Op.getNode()->isStrictFPOpcode()) { + // FIXME: A strict version of CVTSI2P is needed. return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); @@ -18528,7 +18552,7 @@ if (VT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); - SDValue ValueToStore = Op.getOperand(0); + SDValue ValueToStore = Op.getOperand(OpNo); if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come @@ -18630,6 +18654,7 @@ #endif */ + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -18650,8 +18675,8 @@ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Op.getOperand(0)); + SDValue XR1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -18664,15 +18689,33 @@ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + SDValue Sub; + SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + if (Op.getNode()->isStrictFPOpcode()) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), XR2F, CLod1}); + Chain = Sub.getValue(1); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { + // FIXME: Do we need a STRICT version of FHADD? Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + if (Op.getNode()->isStrictFPOpcode()) { + Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, + {Chain, Shuffle, Sub}); + Chain = Result.getValue(1); + } else + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + } + if (Op.getNode()->isStrictFPOpcode()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); + return DAG.getMergeValues({Result, Chain}, dl); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, @@ -18682,14 +18725,15 @@ /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. - SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - Op.getOperand(0)); + SDValue Load = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); @@ -18709,6 +18753,23 @@ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + if (Op.getNode()->isStrictFPOpcode()) { + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Chain = Op.getOperand(0); + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Chain, Or, Bias}); + + if (!Op.getValueType().bitsEq(Sub.getValueType())) { + // Handle final rounding. + std::pair ResultPair = DAG.getStrictFPExtendOrRound( + Sub, Sub.getValue(1), dl, Op.getSimpleValueType()); + + return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl); + } + return Sub; + } + // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -18723,6 +18784,10 @@ if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); + // FIXME: Need to fix the lack of StrictFP support here. + if (Op.getNode()->isStrictFPOpcode()) + return SDValue(); + SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); @@ -18849,7 +18914,8 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); @@ -18867,11 +18933,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + SDValue Chain = + Op.getNode()->isStrictFPOpcode() ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); @@ -18892,6 +18961,10 @@ // Promote i32 to i64 and use a signed conversion on 64-bit targets. if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); + if (Op.getNode()->isStrictFPOpcode()) { + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Chain, N0}); + } return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); } @@ -18909,8 +18982,8 @@ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); - SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo()); + SDValue Store1 = DAG.getStore(Chain, dl, Op.getOperand(OpNo), StackSlot, + MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); @@ -18918,14 +18991,14 @@ } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue ValueToStore = Op.getOperand(0); + SDValue ValueToStore = Op.getOperand(OpNo); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, - MachinePointerInfo()); + SDValue Store = + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, @@ -18940,13 +19013,14 @@ SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); + Chain = Fild.getValue(1); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), - Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( @@ -18961,11 +19035,18 @@ // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); + Chain = Fudge.getValue(1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? + if (Op.getNode()->isStrictFPOpcode()) { + SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other}, + {Chain, Fild, Fudge}); + return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, + {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)}); + } SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); @@ -27857,7 +27938,9 @@ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::STRICT_UINT_TO_FP: case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); Index: llvm/test/CodeGen/X86/fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/fp-intrinsics.ll +++ llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1950,6 +1950,426 @@ ret i64 %result } +; Verify that sitofp(%x) isn't simplified when the rounding mode is +; unknown. The expansion should have only one conversion instruction. +; Verify that no gross errors happen. +define double @sifdi(i32 %x) #0 { +; X87-LABEL: sifdi: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2sd %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdi: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffi(i32 %x) #0 { +; X87-LABEL: siffi: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2ss %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffi: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double @sifdl(i64 %x) #0 { +; X87-LABEL: sifdl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fstpl (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2sd %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffl(i64 %x) #0 { +; X87-LABEL: siffl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: flds {{[0-9]+}}(%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that uitofp(%x) isn't simplified when the rounding mode is +; unknown. Expansions from i32 should have only one conversion instruction. +; Verify that no gross errors happen. +define double @uifdi(i32 %x) #0 { +; X87-LABEL: uifdi: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: movl $0, {{[0-9]+}}(%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: orpd %xmm0, %xmm1 +; X86-SSE-NEXT: subsd %xmm0, %xmm1 +; X86-SSE-NEXT: movsd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uifdi: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uifdi: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @uifdl(i64 %x) #0 { +; X87-LABEL: uifdl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 24 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: xorl %eax, %eax +; X87-NEXT: testl %ecx, %ecx +; X87-NEXT: setns %al +; X87-NEXT: fildll (%esp) +; X87-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: addl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-SSE-NEXT: subpd {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movapd %xmm0, %xmm1 +; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; X86-SSE-NEXT: addpd %xmm0, %xmm1 +; X86-SSE-NEXT: movlpd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdi, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: subpd {{.*}}(%rip), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uifdl: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uifdl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @uiffi(i32 %x) #0 { +; X87-LABEL: uiffi: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: movl $0, {{[0-9]+}}(%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: orpd %xmm0, %xmm1 +; X86-SSE-NEXT: subsd %xmm0, %xmm1 +; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: cvtsd2ss %xmm1, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uiffi: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uiffi: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @uiffl(i64 %x) #0 { +; X87-LABEL: uiffl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 24 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X87-NEXT: xorl %eax, %eax +; X87-NEXT: testl %ecx, %ecx +; X87-NEXT: setns %al +; X87-NEXT: fildll {{[0-9]+}}(%esp) +; X87-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X87-NEXT: fstps {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: addl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: setns %al +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: js .LBB44_1 +; SSE-NEXT: # %bb.2: # %entry +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: .LBB44_1: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: andl $1, %edi +; SSE-NEXT: orq %rax, %rdi +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uiffl: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: js .LBB44_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB44_1: +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uiffl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -1990,3 +2410,11 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i64(i64, metadata, metadata) Index: llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -5690,6 +5690,1539 @@ ret <3 x double> %trunc } +define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm2 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %eax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: cvtsi2sd %rsi, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 +; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: movapd %xmm3, %xmm1 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB170_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB170_1: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: js .LBB170_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB170_1: +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm4, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: subpd %xmm4, %xmm3 +; CHECK-NEXT: movapd %xmm3, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; CHECK-NEXT: addpd %xmm3, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB174_5 +; CHECK-NEXT: .LBB174_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB174_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_4 +; CHECK-NEXT: .LBB174_5: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB174_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB174_5 +; AVX1-NEXT: .LBB174_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB174_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB174_4 +; AVX1-NEXT: .LBB174_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm3, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: movq %rdx, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0] +; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: js .LBB178_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: jns .LBB178_5 +; CHECK-NEXT: .LBB178_4: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jns .LBB178_8 +; CHECK-NEXT: .LBB178_7: +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB178_1: +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: orq %rax, %rsi +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB178_4 +; CHECK-NEXT: .LBB178_5: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: js .LBB178_7 +; CHECK-NEXT: .LBB178_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB178_5 +; AVX1-NEXT: .LBB178_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB178_6 +; AVX1-NEXT: .LBB178_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_4 +; AVX1-NEXT: .LBB178_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: .LBB178_6: # %entry +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_7 +; AVX1-NEXT: # %bb.8: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB178_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $3, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vextractps $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $3, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vextractps $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm2 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: movapd {{.*#+}} xmm5 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm5, %xmm2 +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-NEXT: addpd %xmm2, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX1-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX1-NEXT: vaddpd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX1-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_5 +; CHECK-NEXT: .LBB182_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: addss %xmm3, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_8 +; CHECK-NEXT: .LBB182_7: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: jmp .LBB182_9 +; CHECK-NEXT: .LBB182_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: addss %xmm2, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_4 +; CHECK-NEXT: .LBB182_5: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_7 +; CHECK-NEXT: .LBB182_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: .LBB182_9: # %entry +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_10 +; CHECK-NEXT: # %bb.11: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB182_10: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB182_5 +; AVX1-NEXT: .LBB182_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB182_6 +; AVX1-NEXT: .LBB182_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_4 +; AVX1-NEXT: .LBB182_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: .LBB182_6: # %entry +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_7 +; AVX1-NEXT: # %bb.8: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB182_11 +; AVX1-NEXT: .LBB182_10: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB182_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_10 +; AVX1-NEXT: .LBB182_11: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + attributes #0 = { strictfp } ; Single width declarations @@ -5726,6 +7259,14 @@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) ; Scalar width declarations declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata) @@ -5761,6 +7302,14 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) ; Illegal width declarations declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata) @@ -5819,6 +7368,14 @@ declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) ; Double width declarations declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata) @@ -5854,3 +7411,12 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) + Index: llvm/test/Feature/fp-intrinsics.ll =================================================================== --- llvm/test/Feature/fp-intrinsics.ll +++ llvm/test/Feature/fp-intrinsics.ll @@ -373,6 +373,28 @@ ret i64 %result } +; Verify that sitofp(42) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: @f30 +; CHECK: call double @llvm.experimental.constrained.sitofp +define double @f30() #0 { +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +; Verify that uitofp(42) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: @f31 +; CHECK: call double @llvm.experimental.constrained.uitofp +define double @f31() #0 { +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -405,3 +427,5 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)