Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -15558,6 +15558,78 @@ The result produced is a signed integer converted from the floating point operand. The value is truncated, so it is rounded towards zero. +'``llvm.experimental.constrained.uitofp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.uitofp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.uitofp``' intrinsic converts an +unsigned integer ``value`` to a floating-point of type ``ty2``. + +Arguments: +"""""""""" + +The first argument to the '``llvm.experimental.constrained.uitofp``' +intrinsic must be an :ref:`integer ` or :ref:`vector +` of integer values. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +An inexact floating-point exception will be raised if rounding is required. +Any result produced is a floating point value converted from the input +integer operand. + +'``llvm.experimental.constrained.sitofp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sitofp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sitofp``' intrinsic converts a +signed integer ``value`` to a floating-point of type ``ty2``. + +Arguments: +"""""""""" + +The first argument to the '``llvm.experimental.constrained.sitofp``' +intrinsic must be an :ref:`integer ` or :ref:`vector +` of integer values. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +An inexact floating-point exception will be raised if rounding is required. +Any result produced is a floating point value converted from the input +integer operand. + '``llvm.experimental.constrained.fptrunc``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -310,6 +310,13 @@ STRICT_FP_TO_SINT, STRICT_FP_TO_UINT, + /// STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to + /// a floating point value. These have the same semantics as sitofp and + /// uitofp in IR. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_SINT_TO_FP, + STRICT_UINT_TO_FP, + /// X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating /// point type down to the precision of the destination VT. TRUNC is a /// flag, which is always an integer that is zero or one. If TRUNC is 0, Index: llvm/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAG.h +++ llvm/include/llvm/CodeGen/SelectionDAG.h @@ -811,6 +811,11 @@ /// float type VT, by either extending or rounding (by truncation). SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT); + /// Convert Op, which must be a STRICT operation of float type, to the + /// float type VT, by either extending or rounding (by truncation). 
+ std::pair + getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT); + /// Convert Op, which must be of integer type, to the /// integer type VT, by either any-extending or truncating it. SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT); Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -4121,14 +4121,18 @@ /// Expand float to UINT conversion /// \param N Node to expand /// \param Result output after conversion + /// \param Output output chain after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const; + bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand UINT(i64) to double(f64) conversion /// \param N Node to expand /// \param Result output after conversion + /// \param Output output chain after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; Index: llvm/include/llvm/IR/ConstrainedOps.def =================================================================== --- llvm/include/llvm/IR/ConstrainedOps.def +++ llvm/include/llvm/IR/ConstrainedOps.def @@ -36,6 +36,8 @@ INSTRUCTION(FDiv, 2, 1, experimental_constrained_fdiv, FDIV) INSTRUCTION(FRem, 2, 1, experimental_constrained_frem, FREM) INSTRUCTION(FPExt, 1, 0, experimental_constrained_fpext, FP_EXTEND) +INSTRUCTION(SIToFP, 1, 1, experimental_constrained_sitofp, SINT_TO_FP) +INSTRUCTION(UIToFP, 1, 1, experimental_constrained_uitofp, UINT_TO_FP) INSTRUCTION(FPToSI, 1, 0, experimental_constrained_fptosi, FP_TO_SINT) INSTRUCTION(FPToUI, 1, 0, experimental_constrained_fptoui, FP_TO_UINT) INSTRUCTION(FPTrunc, 1, 1, experimental_constrained_fptrunc, FP_ROUND) Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -640,6 +640,16 @@ [ llvm_anyfloat_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_sitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + + def int_experimental_constrained_uitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptrunc : Intrinsic<[ llvm_anyfloat_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty, Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -172,10 +172,10 @@ SDValue NewIntValue) const; SDValue ExpandFCOPYSIGN(SDNode *Node) const; SDValue ExpandFABS(SDNode *Node) const; - SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, - const SDLoc &dl); - SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, - const SDLoc &dl); + void ExpandLegalINT_TO_FP(SDNode *Node, const SDLoc &dl, + SmallVectorImpl &Results); + void 
PromoteLegalINT_TO_FP(SDNode *N, const SDLoc &dl, + SmallVectorImpl &Results); void PromoteLegalFP_TO_INT(SDNode *N, const SDLoc &dl, SmallVectorImpl &Results); @@ -1016,6 +1016,14 @@ Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + // These pseudo-ops are the same as the other STRICT_ ops except + // they are registered with setOperationAction() using the input type + // instead of the output type. + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()); + break; case ISD::STRICT_LRINT: case ISD::STRICT_LLRINT: case ISD::STRICT_LROUND: @@ -2337,10 +2345,15 @@ /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are /// legal for the target. -SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, - EVT DestVT, - const SDLoc &dl) { +void SelectionDAGLegalize::ExpandLegalINT_TO_FP( + SDNode *Node, const SDLoc &dl, SmallVectorImpl &Results) { + bool IsStrict = Node->isStrictFPOpcode(); + bool IsSigned = Node->getOpcode() == ISD::SINT_TO_FP || + Node->getOpcode() == ISD::STRICT_SINT_TO_FP; + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Op0 = Node->getOperand(OpNo); EVT SrcVT = Op0.getValueType(); + EVT DestVT = Node->getValueType(0); // TODO: Should any fast-math-flags be set for the created nodes? LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n"); @@ -2363,7 +2376,7 @@ // if signed map to unsigned space SDValue Op0Mapped; - if (isSigned) { + if (IsSigned) { // constant used to invert sign bit (signed to unsigned mapping) SDValue SignBit = DAG.getConstant(0x80000000u, dl, MVT::i32); Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit); @@ -2382,20 +2395,41 @@ SDValue Load = DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo()); // FP constant to bias correct the final result - SDValue Bias = DAG.getConstantFP(isSigned ? + SDValue Bias = DAG.getConstantFP(IsSigned ? BitsToDouble(0x4330000080000000ULL) : BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // subtract the bias + if (IsStrict) { + SDValue Result = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Node->getOperand(0), Load, Bias}); + SDValue NewChain = Result.getValue(1); + + if (DestVT != Result.getValueType()) { + std::pair ResultPair; + ResultPair = DAG.getStrictFPExtendOrRound(Result, NewChain, dl, DestVT); + Result = ResultPair.first; + NewChain = ResultPair.second; + } + + Results.push_back(Result); + Results.push_back(NewChain); + return; + } + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); - // final result - SDValue Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); - return Result; + Results.push_back(DAG.getFPExtendOrRound(Sub, dl, DestVT)); + return; } - assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); - // Code below here assumes !isSigned without checking again. + assert(!IsSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); + // Code below here assumes !IsSigned without checking again. 
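A minimal hand-written sketch (not part of the patch) of the bias trick that the i32 path of ExpandLegalINT_TO_FP above builds, written as plain LLVM IR for the unsigned i32 -> f64 case. The function name is illustrative; the signed case additionally XORs the input with 0x80000000 and subtracts the bias 0x4330000080000000 (2^52 + 2^31) instead:

define double @u32_to_f64_bias_trick(i32 %x) {
entry:
  ; Build the bit pattern 0x43300000_xxxxxxxx, i.e. the double value 2^52 + x.
  %wide   = zext i32 %x to i64
  %bits   = or i64 %wide, 4841369599423283200       ; 0x4330000000000000
  %biased = bitcast i64 %bits to double
  ; Subtracting the bias 2^52 leaves (double)x exactly. The fsub is the only
  ; floating-point operation involved, which is why the strict expansion above
  ; emits it as STRICT_FSUB and threads the chain through it.
  %sub = fsub double %biased, 0x4330000000000000    ; 2^52 as an f64 constant
  ret double %sub
}

The DAG expansion builds the same value through a stack store and f64 reload rather than an integer OR plus bitcast, but the arithmetic is identical.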
- SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); + SDValue Tmp1; + if (IsStrict) + Tmp1 = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DestVT, MVT::Other}, + {Node->getOperand(0), Op0}); + else + Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); @@ -2441,7 +2475,15 @@ FudgeInReg = Handle.getValue(); } - return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg); + if (IsStrict) { + SDValue Result = DAG.getNode(ISD::STRICT_FADD, dl, {DestVT, MVT::Other}, + {Tmp1.getValue(1), Tmp1, FudgeInReg}); + Results.push_back(Result); + Results.push_back(Result.getValue(1)); + return; + } + + Results.push_back(DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg)); } /// This function is responsible for legalizing a @@ -2449,9 +2491,13 @@ /// we promote it. At this point, we know that the result and operand types are /// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP /// operation that takes a larger input. -SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, - bool isSigned, - const SDLoc &dl) { +void SelectionDAGLegalize::PromoteLegalINT_TO_FP( + SDNode *N, const SDLoc &dl, SmallVectorImpl &Results) { + bool IsStrict = N->isStrictFPOpcode(); + bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP || + N->getOpcode() == ISD::STRICT_SINT_TO_FP; + EVT DestVT = N->getValueType(0); + SDValue LegalOp = N->getOperand(IsStrict ? 1 : 0); // First step, figure out the appropriate *INT_TO_FP operation to use. EVT NewInTy = LegalOp.getValueType(); @@ -2463,26 +2509,34 @@ assert(NewInTy.isInteger() && "Ran out of possibilities!"); // If the target supports SINT_TO_FP of this type, use it. - if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, NewInTy)) { - OpToUse = ISD::SINT_TO_FP; + OpToUse = IsStrict ? ISD::STRICT_SINT_TO_FP : ISD::SINT_TO_FP; + if (TLI.isOperationLegalOrCustom(OpToUse, NewInTy)) break; - } - if (isSigned) continue; + if (IsSigned) continue; // If the target supports UINT_TO_FP of this type, use it. - if (TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, NewInTy)) { - OpToUse = ISD::UINT_TO_FP; + OpToUse = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP; + if (TLI.isOperationLegalOrCustom(OpToUse, NewInTy)) break; - } // Otherwise, try a larger type. } - // Okay, we found the operation and type to use. Zero extend our input to the + // Okay, we found the operation and type to use. Extend our input to the // desired type then run the operation on it. - return DAG.getNode(OpToUse, dl, DestVT, - DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - dl, NewInTy, LegalOp)); + SDValue Ext = DAG.getNode(IsSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, + NewInTy, LegalOp); + + SDValue Operation; + if (IsStrict) + Operation = + DAG.getNode(OpToUse, dl, {DestVT, MVT::Other}, {N->getOperand(0), Ext}); + else + Operation = DAG.getNode(OpToUse, dl, DestVT, Ext); + + Results.push_back(Operation); + if (IsStrict) + Results.push_back(Operation.getValue(1)); } /// This function is responsible for legalizing a @@ -2898,15 +2952,20 @@ break; } case ISD::UINT_TO_FP: - if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) { - Results.push_back(Tmp1); + case ISD::STRICT_UINT_TO_FP: + if (TLI.expandUINT_TO_FP(Node, Tmp1, Tmp2, DAG)) { + if (Node->isStrictFPOpcode()) { + Results.push_back(Tmp1); + Results.push_back(Tmp2); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_UINT_TO_FP node\n"); + } else + Results.push_back(Tmp1); break; } LLVM_FALLTHROUGH; case ISD::SINT_TO_FP: - Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP, - Node->getOperand(0), Node->getValueType(0), dl); - Results.push_back(Tmp1); + case ISD::STRICT_SINT_TO_FP: + ExpandLegalINT_TO_FP(Node, dl, Results); break; case ISD::FP_TO_SINT: if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) @@ -4173,10 +4232,12 @@ Node->getOpcode() == ISD::SINT_TO_FP || Node->getOpcode() == ISD::SETCC || Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT || - Node->getOpcode() == ISD::INSERT_VECTOR_ELT) { + Node->getOpcode() == ISD::INSERT_VECTOR_ELT) OVT = Node->getOperand(0).getSimpleValueType(); - } - if (Node->getOpcode() == ISD::BR_CC) + else if (Node->getOpcode() == ISD::STRICT_UINT_TO_FP || + Node->getOpcode() == ISD::STRICT_SINT_TO_FP) + OVT = Node->getOperand(1).getSimpleValueType(); + else if (Node->getOpcode() == ISD::BR_CC) OVT = Node->getOperand(2).getSimpleValueType(); MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT); SDLoc dl(Node); @@ -4230,10 +4291,10 @@ PromoteLegalFP_TO_INT(Node, dl, Results); break; case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::SINT_TO_FP: - Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0), - Node->getOpcode() == ISD::SINT_TO_FP, dl); - Results.push_back(Tmp1); + case ISD::STRICT_SINT_TO_FP: + PromoteLegalINT_TO_FP(Node, dl, Results); break; case ISD::VAARG: { SDValue Chain = Node->getOperand(0); // Get the chain. Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -307,13 +307,18 @@ return TranslateLegalizeResults(Op, Result); TargetLowering::LegalizeAction Action = TargetLowering::Legal; + EVT ValVT; switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Result); #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" - Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); + ValVT = Node->getValueType(0); + if (Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP) + ValVT = Node->getOperand(1).getValueType(); + Action = TLI.getOperationAction(Node->getOpcode(), ValVT); // If we're asked to expand a strict vector floating-point operation, // by default we're going to simply unroll it. 
That is usually the // best approach, except in the case where the resulting strict (scalar) @@ -1153,17 +1158,27 @@ } SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { - EVT VT = Op.getOperand(0).getValueType(); + bool IsStrict = Op.getNode()->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + EVT VT = Op.getOperand(OpNo).getValueType(); SDLoc DL(Op); // Attempt to expand using TargetLowering. SDValue Result; - if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG)) + SDValue Chain; + if (TLI.expandUINT_TO_FP(Op.getNode(), Result, Chain, DAG)) { + if (IsStrict) + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Chain); return Result; + } // Make sure that the SINT_TO_FP and SRL instructions are available. - if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand || - TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) + if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, VT) == + TargetLowering::Expand) || + (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, VT) == + TargetLowering::Expand)) || + TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) return DAG.UnrollVectorOp(Op.getNode()); unsigned BW = VT.getScalarSizeInBits(); @@ -1185,6 +1200,29 @@ SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), HalfWordMask); + if (IsStrict) { + // Convert hi and lo to floats + // Convert the hi part back to the upper values + // TODO: Can any fast-math-flags be set on these nodes? + SDValue fHI = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {Op.getOperand(0), HI}); + fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), fHI, TWOHW}); + SDValue fLO = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), LO}); + + // Add the two halves + SDValue Result = + DAG.getNode(ISD::STRICT_FADD, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fLO.getNode(), 1), fHI, fLO}); + + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), SDValue(Result.getNode(), 1)); + return Result; + } + // Convert hi and lo to floats // Convert the hi part back to the upper values // TODO: Can any fast-math-flags be set on these nodes? Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -587,6 +587,8 @@ case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: Res = ScalarizeVecOp_UnaryOp_StrictFP(N); @@ -1261,6 +1263,8 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: SplitVecRes_UnaryOp(N, Lo, Hi); return; default: @@ -1977,9 +1981,12 @@ case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) + if (N->getValueType(0).bitsLT( + N->getOperand(N->isStrictFPOpcode() ? 
1 : 0).getValueType())) Res = SplitVecOp_TruncateHelper(N); else Res = SplitVecOp_UnaryOp(N); @@ -2540,7 +2547,8 @@ // // Without this transform, the original truncate would end up being // scalarized, which is pretty much always a last resort. - SDValue InVec = N->getOperand(0); + unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0; + SDValue InVec = N->getOperand(OpNo); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); unsigned NumElements = OutVT.getVectorNumElements(); @@ -2584,8 +2592,23 @@ EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements/2); - SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); - SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + + SDValue HalfLo; + SDValue HalfHi; + SDValue Chain; + if (N->isStrictFPOpcode()) { + HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfLo}); + HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfHi}); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1), + HalfHi.getValue(1)); + } else { + HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); + HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + } // Concatenate them to get the full intermediate truncation result. EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, @@ -2594,6 +2617,17 @@ // type. This should normally be something that ends up being legal directly, // but in theory if a target has very wide vectors and an annoyingly // restricted set of legal types, this split can chain to build things up. + + if (N->isStrictFPOpcode()) { + SDValue Res = DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {OutVT, MVT::Other}, + {Chain, InterVec, + DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))}); + // Relink the chain + ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1)); + return Res; + } + return IsFloat ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec, DAG.getTargetConstant( @@ -3046,7 +3080,9 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: - return WidenVecRes_Convert_StrictFP(N); + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + return WidenVecRes_Convert_StrictFP(N); default: break; } @@ -4128,7 +4164,9 @@ case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1113,6 +1113,18 @@ : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); } +std::pair +SelectionDAG::getStrictFPExtendOrRound(SDValue Op, SDValue Chain, + const SDLoc &DL, EVT VT) { + SDValue Res = + VT.bitsGT(Op.getValueType()) + ? 
getNode(ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, {Chain, Op}) + : getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, Op, getIntPtrConstant(0, DL)}); + + return std::pair(Res, SDValue(Res.getNode(), 1)); +} + SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ANY_EXTEND, DL, VT, Op) : Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -324,7 +324,9 @@ case ISD::STRICT_FP_EXTEND: return "strict_fp_extend"; case ISD::SINT_TO_FP: return "sint_to_fp"; + case ISD::STRICT_SINT_TO_FP: return "strict_sint_to_fp"; case ISD::UINT_TO_FP: return "uint_to_fp"; + case ISD::STRICT_UINT_TO_FP: return "strict_uint_to_fp"; case ISD::FP_TO_SINT: return "fp_to_sint"; case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint"; case ISD::FP_TO_UINT: return "fp_to_uint"; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6104,8 +6104,10 @@ } bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, + SDValue &Chain, SelectionDAG &DAG) const { - SDValue Src = Node->getOperand(0); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -6128,7 +6130,13 @@ // For unsigned conversions, convert them to signed conversions using the // algorithm from the x86_64 __floatundidf in compiler_rt. - SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); + SDValue Fast; + if (Node->isStrictFPOpcode()) { + Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), Src}); + Chain = SDValue(Fast.getNode(), 1); + } else + Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT); SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst); @@ -6136,8 +6144,17 @@ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst); SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr); - SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); - SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + SDValue Slow; + if (Node->isStrictFPOpcode()) { + SDValue SignCvt = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, + {DstVT, MVT::Other}, {Chain, Or}); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, + {SignCvt.getValue(1), SignCvt, SignCvt}); + Chain = Slow.getValue(1); + } else { + SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); + Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + } // TODO: This really should be implemented using a branch rather than a // select. 
We happen to get lucky and machinesink does the right @@ -6180,8 +6197,18 @@ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); - Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + if (Node->isStrictFPOpcode()) { + SDValue HiSub = + DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); + Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, + {HiSub.getValue(1), LoFlt, HiSub}); + Chain = Result.getValue(1); + } else { + SDValue HiSub = + DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + } return true; } Index: llvm/lib/IR/Verifier.cpp =================================================================== --- llvm/lib/IR/Verifier.cpp +++ llvm/lib/IR/Verifier.cpp @@ -4785,6 +4785,30 @@ } break; + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: { + Assert((NumOperands == 3), "invalid arguments for constrained FP intrinsic", + &FPI); + Value *Operand = FPI.getArgOperand(0); + uint64_t NumSrcElem = 0; + Assert(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic first argument must be integer", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + NumSrcElem = OperandT->getNumElements(); + } + + Operand = &FPI; + Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Assert(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic result must be a floating point", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + Assert(NumSrcElem == OperandT->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); + } + } break; + case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: { Value *Operand = FPI.getArgOperand(0); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -228,26 +228,34 @@ if (!Subtarget.useSoftFloat()) { // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. - setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. 
- setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); // SSE has no i16 to fp conversion, only i32. We promote in the handler // to allow f80 to use i16 and f64 to use i16 with sse1 only - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. @@ -989,9 +997,12 @@ setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); @@ -18412,8 +18423,13 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); - SDValue Src = Op.getOperand(0); + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); @@ -18430,6 +18446,15 @@ SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); + if (IsStrict) { + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, + {Op.getOperand(0), InVec}); + SDValue Chain = CvtVec.getValue(1); + SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); + return DAG.getMergeValues({Value, Chain}, dl); + } + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); @@ -18501,7 +18526,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -18510,7 +18537,8 @@ return Extract; if (SrcVT.isVector()) { - if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64 && + !IsStrict) { // FIXME:: Strict FP! 
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); @@ -18536,28 +18564,38 @@ // SSE doesn't have an i16 conversion so we need to promote. if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) { SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, + {Op.getOperand(0), Ext}); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext); } if (VT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); - SDValue ValueToStore = Op.getOperand(0); + SDValue ValueToStore = Src; if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + // FIXME:: Support strict FP! unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, ValueToStore, StackSlot, + SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode(); + Chain = DAG.getStore( + Chain, dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); - return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG).first; + std::pair Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + if (IsStrict) + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); + + return Tmp.first; } std::pair X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, @@ -18645,6 +18683,8 @@ #endif */ + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -18665,8 +18705,8 @@ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Op.getOperand(0)); + SDValue XR1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -18679,32 +18719,51 @@ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + SDValue Sub; + SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + if (IsStrict) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), XR2F, CLod1}); + Chain = SDValue(Sub.getNode(), 1); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { + if (!IsStrict && Subtarget.hasSSE3() && + shouldUseHorizontalOp(true, DAG, Subtarget)) { + // FIXME: Do we need a STRICT version of FHADD? 
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + if (IsStrict) { + Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, + {Chain, Shuffle, Sub}); + Chain = Result.getValue(1); + } else + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, - DAG.getIntPtrConstant(0, dl)); + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Result, Chain}, dl); + + return Result; } /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. - SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - Op.getOperand(0)); + SDValue Load = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); @@ -18724,6 +18783,23 @@ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + if (Op.getNode()->isStrictFPOpcode()) { + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Chain = Op.getOperand(0); + SDValue Res = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Chain, Or, Bias}); + + // Handle final rounding. + if (Op.getSimpleValueType() == MVT::f64) + return Res; + + Chain = Res.getValue(1); + std::pair Tmp = + DAG.getStrictFPExtendOrRound(Res, Chain, dl, Op.getSimpleValueType()); + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); + } + // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -18738,6 +18814,10 @@ if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); + // FIXME:: Support strict FP! + if (Op.getNode()->isStrictFPOpcode()) + return SDValue(); + SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); @@ -18864,7 +18944,8 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); @@ -18882,11 +18963,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue N0 = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - MVT SrcVT = N0.getSimpleValueType(); + MVT SrcVT = Src.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); @@ -18906,8 +18990,12 @@ // Promote i32 to i64 and use a signed conversion on 64-bit targets. 
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { - N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); - return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); + Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Chain, Src}); + + return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); } if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) @@ -18924,22 +19012,28 @@ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); - SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo()); + SDValue Store1 = + DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); - return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG).first; + std::pair Tmp = + BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + if (IsStrict) + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); + + return Tmp.first; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue ValueToStore = Op.getOperand(0); - if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) + SDValue ValueToStore = Src; + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) { // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, - MachinePointerInfo()); + } + SDValue Store = + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, @@ -18954,13 +19048,14 @@ SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); + Chain = Fild.getValue(1); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), - Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( @@ -18975,11 +19070,19 @@ // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); + Chain = Fudge.getValue(1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? 
+ if (IsStrict) { + SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other}, + {Chain, Fild, Fudge}); + Chain = Add.getValue(1); + return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, + {Chain, Add, DAG.getIntPtrConstant(0, dl)}); + } SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); @@ -19033,10 +19136,7 @@ int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - if (IsStrict) - Chain = Op.getOperand(0); - else - Chain = DAG.getEntryNode(); + Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. @@ -27894,7 +27994,9 @@ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::STRICT_UINT_TO_FP: case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); Index: llvm/test/CodeGen/X86/fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/fp-intrinsics.ll +++ llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1950,6 +1950,762 @@ ret i64 %result } +; Verify that sitofp(%x) isn't simplified when the rounding mode is +; unknown. +; Verify that no gross errors happen. +define double @sifdb(i8 %x) #0 { +; X87-LABEL: sifdb: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X87-NEXT: filds {{[0-9]+}}(%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdb: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2sd %eax, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdb: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsbl %dil, %eax +; SSE-NEXT: cvtsi2sd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdb: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movsbl %dil, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i8(i8 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @sifdw(i16 %x) #0 { +; X87-LABEL: sifdw: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X87-NEXT: filds {{[0-9]+}}(%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdw: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2sd %eax, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movswl %di, %eax +; SSE-NEXT: cvtsi2sd %eax, 
%xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdw: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movswl %di, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i16(i16 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @sifdi(i32 %x) #0 { +; X87-LABEL: sifdi: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2sd %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdi: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffb(i8 %x) #0 { +; X87-LABEL: siffb: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X87-NEXT: filds {{[0-9]+}}(%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffb: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2ss %eax, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffb: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsbl %dil, %eax +; SSE-NEXT: cvtsi2ss %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffb: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movsbl %dil, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i8(i8 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @siffw(i16 %x) #0 { +; X87-LABEL: siffw: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X87-NEXT: filds {{[0-9]+}}(%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffw: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2ss %eax, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movswl %di, %eax +; SSE-NEXT: cvtsi2ss %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffw: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movswl %di, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float 
@llvm.experimental.constrained.sitofp.f32.i16(i16 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @siffi(i32 %x) #0 { +; X87-LABEL: siffi: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2ss %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffi: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double @sifdl(i64 %x) #0 { +; X87-LABEL: sifdl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fstpl (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2sd %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffl(i64 %x) #0 { +; X87-LABEL: siffl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: flds {{[0-9]+}}(%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x, + metadata 
!"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that uitofp(%x) isn't simplified when the rounding mode is +; unknown. +; Verify that no gross errors happen. +define double @uifdb(i8 %x) #0 { +; X87-LABEL: uifdb: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X87-NEXT: filds {{[0-9]+}}(%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdb: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2sd %eax, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdb: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: cvtsi2sd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: uifdb: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i8(i8 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @uifdw(i16 %x) #0 { +; X87-LABEL: uifdw: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdw: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2sd %eax, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: cvtsi2sd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: uifdw: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i16(i16 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @uifdi(i32 %x) #0 { +; X87-LABEL: uifdi: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: movl $0, {{[0-9]+}}(%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: orpd %xmm0, %xmm1 +; X86-SSE-NEXT: subsd %xmm0, %xmm1 +; X86-SSE-NEXT: movsd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uifdi: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: 
vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uifdi: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @uifdl(i64 %x) #0 { +; X87-LABEL: uifdl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 24 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: xorl %eax, %eax +; X87-NEXT: testl %ecx, %ecx +; X87-NEXT: setns %al +; X87-NEXT: fildll (%esp) +; X87-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: addl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-SSE-NEXT: subpd {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movapd %xmm0, %xmm1 +; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; X86-SSE-NEXT: addpd %xmm0, %xmm1 +; X86-SSE-NEXT: movlpd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdi, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: subpd {{.*}}(%rip), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uifdl: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uifdl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @uiffb(i8 %x) #0 { +; X87-LABEL: uiffb: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X87-NEXT: filds {{[0-9]+}}(%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffb: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2ss %eax, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffb: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: cvtsi2ss %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: uiffb: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i8(i8 %x, + metadata !"round.dynamic", 
+ metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @uiffw(i16 %x) #0 { +; X87-LABEL: uiffw: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffw: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cvtsi2ss %eax, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: cvtsi2ss %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: uiffw: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @uiffi(i32 %x) #0 { +; X87-LABEL: uiffi: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: movl $0, {{[0-9]+}}(%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: orpd %xmm0, %xmm1 +; X86-SSE-NEXT: subsd %xmm0, %xmm1 +; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: cvtsd2ss %xmm1, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uiffi: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uiffi: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @uiffl(i64 %x) #0 { +; X87-LABEL: uiffl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 24 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X87-NEXT: xorl %eax, %eax +; X87-NEXT: testl %ecx, %ecx +; X87-NEXT: setns %al +; X87-NEXT: fildll {{[0-9]+}}(%esp) +; X87-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X87-NEXT: fstps {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: addl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: xorl %eax, %eax +; 
X86-SSE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: setns %al +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: js .LBB52_1 +; SSE-NEXT: # %bb.2: # %entry +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: .LBB52_1: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: andl $1, %edi +; SSE-NEXT: orq %rax, %rdi +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uiffl: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: js .LBB52_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB52_1: +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uiffl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -1990,3 +2746,19 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i8(i8, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i16(i16, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i8(i8, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i16(i16, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i8(i8, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i16(i16, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i64(i64, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i8(i8, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i16(i16, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata) Index: llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -5690,6 
+5690,1539 @@ ret <3 x double> %trunc } +define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 
+; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps 
$1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm2 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %eax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: cvtsi2sd %rsi, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 +; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { 
+; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: movapd %xmm3, %xmm1 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, 
%xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call 
<1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB170_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB170_1: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: js .LBB170_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB170_1: +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, 
%xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm4, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: subpd %xmm4, %xmm3 +; CHECK-NEXT: movapd %xmm3, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; CHECK-NEXT: addpd %xmm3, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB174_5 +; CHECK-NEXT: .LBB174_4: +; CHECK-NEXT: movq %rax, %rcx 
+; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB174_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_4 +; CHECK-NEXT: .LBB174_5: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB174_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB174_5 +; AVX1-NEXT: .LBB174_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB174_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB174_4 +; AVX1-NEXT: .LBB174_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; 
AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm3, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: movq %rdx, %xmm4 +; 
CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0] +; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: js .LBB178_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: jns .LBB178_5 +; CHECK-NEXT: .LBB178_4: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jns .LBB178_8 +; CHECK-NEXT: .LBB178_7: +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB178_1: +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: orq %rax, %rsi +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB178_4 +; CHECK-NEXT: .LBB178_5: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; 
CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: js .LBB178_7 +; CHECK-NEXT: .LBB178_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB178_5 +; AVX1-NEXT: .LBB178_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB178_6 +; AVX1-NEXT: .LBB178_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_4 +; AVX1-NEXT: .LBB178_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: .LBB178_6: # %entry +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_7 +; AVX1-NEXT: # %bb.8: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB178_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: retq 
+; +; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $3, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vextractps $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $3, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vextractps $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm2 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: 
vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: movapd {{.*#+}} xmm5 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm5, %xmm2 +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-NEXT: addpd %xmm2, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX1-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX1-NEXT: vaddpd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX1-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; 
AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_5 +; CHECK-NEXT: .LBB182_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: addss %xmm3, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_8 +; CHECK-NEXT: .LBB182_7: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: jmp .LBB182_9 +; CHECK-NEXT: .LBB182_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: addss %xmm2, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_4 +; CHECK-NEXT: .LBB182_5: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_7 +; CHECK-NEXT: .LBB182_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: .LBB182_9: # %entry +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_10 +; CHECK-NEXT: # %bb.11: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB182_10: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB182_5 +; AVX1-NEXT: 
.LBB182_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB182_6 +; AVX1-NEXT: .LBB182_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_4 +; AVX1-NEXT: .LBB182_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: .LBB182_6: # %entry +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_7 +; AVX1-NEXT: # %bb.8: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB182_11 +; AVX1-NEXT: .LBB182_10: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB182_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_10 +; AVX1-NEXT: .LBB182_11: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + attributes #0 = { strictfp } ; Single width declarations @@ -5726,6 +7259,14 @@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, 
metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) ; Scalar width declarations declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata) @@ -5761,6 +7302,14 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) ; Illegal width declarations declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata) @@ -5819,6 +7368,14 @@ declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) ; Double width declarations declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata) @@ -5854,3 +7411,12 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> 
@llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) + Index: llvm/test/Feature/fp-intrinsics.ll =================================================================== --- llvm/test/Feature/fp-intrinsics.ll +++ llvm/test/Feature/fp-intrinsics.ll @@ -373,6 +373,28 @@ ret i64 %result } +; Verify that sitofp(42) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: @f30 +; CHECK: call double @llvm.experimental.constrained.sitofp +define double @f30() #0 { +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +; Verify that uitofp(42) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: @f31 +; CHECK: call double @llvm.experimental.constrained.uitofp +define double @f31() #0 { +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -405,3 +427,5 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)
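For readers unfamiliar with the constrained-intrinsic form exercised by the tests above, here is a minimal illustrative sketch (not part of this patch) of how a strictfp function would use the two new scalar intrinsics. The function name @convert_both and the trailing constrained.fadd call are assumptions chosen only for this example; the sitofp/uitofp signatures and metadata arguments match the declarations added to llvm/test/Feature/fp-intrinsics.ll above.

; Illustrative only: convert a signed and an unsigned i32 to double under
; dynamic rounding with strict exception semantics, then add the results.
define double @convert_both(i32 %s, i32 %u) #0 {
entry:
  %sd = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %s,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  %ud = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %u,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  %sum = call double @llvm.experimental.constrained.fadd.f64(double %sd, double %ud,
                                               metadata !"round.dynamic",
                                               metadata !"fpexcept.strict") #0
  ret double %sum
}

attributes #0 = { strictfp }

declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata)
declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)
declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)

With the strictfp attribute and "fpexcept.strict" metadata, the optimizer treats these calls conservatively; the f30/f31 tests above rely on this by checking that even constant integer operands are not folded away.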