diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15553,6 +15553,76 @@ The result produced is a signed integer converted from the floating point operand. The value is truncated, so it is rounded towards zero. +'``llvm.experimental.constrained.uitofp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.uitofp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.uitofp``' intrinsic converts an +unsigned integer ``value`` to a floating-point of type ``ty2``. + +Arguments: +"""""""""" + +The first argument to the '``llvm.experimental.constrained.uitofp``' +intrinsic must be an :ref:`integer <t_integer>` or :ref:`vector +<t_vector>` of integer values. An inexact floating-point exception +will be raised if rounding is required. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +The result produced is a floating point value. + +'``llvm.experimental.constrained.sitofp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sitofp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sitofp``' intrinsic converts a +signed integer ``value`` to a floating-point of type ``ty2``. + +Arguments: +"""""""""" + +The first argument to the '``llvm.experimental.constrained.sitofp``' +intrinsic must be an :ref:`integer <t_integer>` or :ref:`vector +<t_vector>` of integer values. An inexact floating-point exception +will be raised if rounding is required. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +The result produced is a floating point value. 
+ '``llvm.experimental.constrained.fptrunc``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -310,6 +310,13 @@ STRICT_FP_TO_SINT, STRICT_FP_TO_UINT, + /// STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to + /// a floating point value. These have the same semantics as sitofp and + /// uitofp in IR. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_SINT_TO_FP, + STRICT_UINT_TO_FP, + /// X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating /// point type down to the precision of the destination VT. TRUNC is a /// flag, which is always an integer that is zero or one. If TRUNC is 0, diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -803,6 +803,11 @@ /// float type VT, by either extending or rounding (by truncation). SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT); + /// Convert Op, which must be a STRICT operation of float type, to the + /// float type VT, by either extending or rounding (by truncation). + SDValue getStrictFPExtendOrRound(SDValue Op, const SDLoc &DL, + EVT VT); + /// Convert Op, which must be of integer type, to the /// integer type VT, by either any-extending or truncating it. 
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -715,6 +715,8 @@ case ISD::STRICT_FTRUNC: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: return true; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -970,6 +970,8 @@ case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break; case ISD::STRICT_FP_TO_SINT: EqOpc = ISD::FP_TO_SINT; break; case ISD::STRICT_FP_TO_UINT: EqOpc = ISD::FP_TO_UINT; break; + case ISD::STRICT_SINT_TO_FP: EqOpc = ISD::SINT_TO_FP; break; + case ISD::STRICT_UINT_TO_FP: EqOpc = ISD::UINT_TO_FP; break; case ISD::STRICT_FP_ROUND: EqOpc = ISD::FP_ROUND; break; case ISD::STRICT_FP_EXTEND: EqOpc = ISD::FP_EXTEND; break; } @@ -4079,13 +4081,15 @@ /// \param N Node to expand /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const; + bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand UINT(i64) to double(f64) conversion /// \param N Node to expand /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. 
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -261,6 +261,8 @@ case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_fptosi: case Intrinsic::experimental_constrained_fptoui: + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -640,6 +640,16 @@ [ llvm_anyfloat_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_sitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + + def int_experimental_constrained_uitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptrunc : Intrinsic<[ llvm_anyfloat_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -172,7 +172,7 @@ SDValue NewIntValue) const; SDValue ExpandFCOPYSIGN(SDNode *Node) const; SDValue ExpandFABS(SDNode *Node) const; - SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, + SDValue ExpandLegalINT_TO_FP(bool isSigned, SDNode *Node, EVT DestVT, const SDLoc &dl); SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl); @@ -1016,6 +1016,14 @@ Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; + case 
ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + // These pseudo-ops are the same as the other STRICT_ ops except + // they are registered with setOperationAction() using the input type + // instead of the output type. + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()); + break; case ISD::STRICT_LRINT: case ISD::STRICT_LLRINT: case ISD::STRICT_LROUND: @@ -1237,6 +1245,15 @@ return; } + if (Node->isStrictFPOpcode() && !Res.getNode()->isStrictFPOpcode()) { + LLVM_DEBUG(dbgs() << "Successfully custom legalized strict node\n"); + // If a STRICT node gets lowered by the target and a non-STRICT + // node is returned we assume that the chain has already been + // handled. + ReplaceNodeWithValue(SDValue(Node, 0), Res); + return; + } + SmallVector ResultVals; for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) ResultVals.push_back(Res.getValue(i)); @@ -2347,9 +2364,11 @@ /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are /// legal for the target. -SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, +SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDNode *Node, EVT DestVT, const SDLoc &dl) { + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Op0 = Node->getOperand(OpNo); EVT SrcVT = Op0.getValueType(); // TODO: Should any fast-math-flags be set for the created nodes? 
@@ -2397,15 +2416,35 @@ BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // subtract the bias - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); + SDValue Sub; + if (Node->isStrictFPOpcode()) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Node->getOperand(0), Load, Bias}); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); // final result - SDValue Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); + SDValue Result; + if (Node->isStrictFPOpcode()) { + if (!DestVT.bitsEq(Sub.getValueType())) { + Result = DAG.getStrictFPExtendOrRound(Sub, dl, DestVT); + } + else + Result = Sub; + // Finally relink the chain + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Result.getValue(1)); + } else + Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); return Result; } assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); // Code below here assumes !isSigned without checking again. - SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); + SDValue Tmp1; + if (Node->isStrictFPOpcode()) { + Tmp1 = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other }, + { Node->getOperand(0), Op0 }); + } else + Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); @@ -2451,6 +2490,14 @@ FudgeInReg = Handle.getValue(); } + if (Node->isStrictFPOpcode()) { + SDValue Result = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other }, + { Tmp1.getValue(1), Tmp1, FudgeInReg }); + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Result.getValue(1)); + return Result; + } + return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg); } @@ -2886,14 +2933,30 @@ break; } case ISD::UINT_TO_FP: - if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) { - Results.push_back(Tmp1); + case ISD::STRICT_UINT_TO_FP: + if (TLI.expandUINT_TO_FP(Node, Tmp1, Tmp2, DAG)) { + if (Node->isStrictFPOpcode()) { + // Relink the 
chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Node,1), Tmp2); + // Replace the new UINT result. + ReplaceNodeWithValue(SDValue(Node, 0), Tmp1); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_UINT_TO_FP node\n"); + } else + Results.push_back(Tmp1); break; } LLVM_FALLTHROUGH; case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + if (Node->isStrictFPOpcode()) { + Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::STRICT_SINT_TO_FP, + Node, Node->getValueType(0), dl); + ReplaceNode(Node, Tmp1.getNode()); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_xINT_TO_FP node\n"); + return true; + } Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP, - Node->getOperand(0), Node->getValueType(0), dl); + Node, Node->getValueType(0), dl); Results.push_back(Tmp1); break; case ISD::FP_TO_SINT: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -307,6 +307,7 @@ return TranslateLegalizeResults(Op, Result); TargetLowering::LegalizeAction Action = TargetLowering::Legal; + EVT ValVT = MVT::Other; switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Result); @@ -338,17 +339,22 @@ case ISD::STRICT_FP_TO_UINT: case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: - Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); + ValVT = Node->getValueType(0); + LLVM_FALLTHROUGH; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + if (ValVT == MVT::Other) + ValVT = Node->getOperand(1).getValueType(); + Action = TLI.getOperationAction(Node->getOpcode(), ValVT); // If we're asked to expand a strict vector floating-point operation, // by default we're going to simply unroll it. That is usually the // best approach, except in the case where the resulting strict (scalar) // operations would themselves use the fallback mutation to non-strict. 
// In that specific case, just do the fallback on the vector op. if (Action == TargetLowering::Expand && - TLI.getStrictFPOperationAction(Node->getOpcode(), - Node->getValueType(0)) - == TargetLowering::Legal) { - EVT EltVT = Node->getValueType(0).getVectorElementType(); + TLI.getStrictFPOperationAction(Node->getOpcode(), ValVT) == + TargetLowering::Legal) { + EVT EltVT = ValVT.getVectorElementType(); if (TLI.getOperationAction(Node->getOpcode(), EltVT) == TargetLowering::Expand && TLI.getStrictFPOperationAction(Node->getOpcode(), EltVT) @@ -1201,17 +1207,27 @@ } SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { - EVT VT = Op.getOperand(0).getValueType(); + bool IsStrict = Op.getNode()->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + EVT VT = Op.getOperand(OpNo).getValueType(); SDLoc DL(Op); // Attempt to expand using TargetLowering. SDValue Result; - if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG)) + SDValue Chain; + if (TLI.expandUINT_TO_FP(Op.getNode(), Result, Chain, DAG)) { + if (IsStrict) + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Chain); return Result; + } // Make sure that the SINT_TO_FP and SRL instructions are available. 
- if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand || - TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) + if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, VT) == + TargetLowering::Expand) || + (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, VT) == + TargetLowering::Expand)) || + TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) return DAG.UnrollVectorOp(Op.getNode()); unsigned BW = VT.getScalarSizeInBits(); @@ -1233,6 +1249,29 @@ SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), HalfWordMask); + if (IsStrict) { + // Convert hi and lo to floats + // Convert the hi part back to the upper values + // TODO: Can any fast-math-flags be set on these nodes? + SDValue fHI = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {Op.getOperand(0), HI}); + fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), fHI, TWOHW}); + SDValue fLO = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), LO}); + + // Add the two halves + SDValue Result = + DAG.getNode(ISD::STRICT_FADD, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fLO.getNode(), 1), fHI, fLO}); + + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), SDValue(Result.getNode(), 1)); + return Result; + } + // Convert hi and lo to floats // Convert the hi part back to the upper values // TODO: Can any fast-math-flags be set on these nodes? 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -172,6 +172,8 @@ case ISD::STRICT_FTRUNC: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_FP_EXTEND: R = ScalarizeVecRes_StrictFPOp(N); break; @@ -606,6 +608,8 @@ case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: Res = ScalarizeVecOp_UnaryOp_StrictFP(N); @@ -914,8 +918,10 @@ case ISD::FSQRT: case ISD::FTRUNC: case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -2003,9 +2009,12 @@ case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) + if (N->getValueType(0).bitsLT( + N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType())) Res = SplitVecOp_TruncateHelper(N); else Res = SplitVecOp_UnaryOp(N); @@ -2562,7 +2571,8 @@ // // Without this transform, the original truncate would end up being // scalarized, which is pretty much always a last resort. - SDValue InVec = N->getOperand(0); + unsigned OpNo = N->isStrictFPOpcode() ? 
1 : 0; + SDValue InVec = N->getOperand(OpNo); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); unsigned NumElements = OutVT.getVectorNumElements(); @@ -2606,8 +2616,23 @@ EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements/2); - SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); - SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + + SDValue HalfLo; + SDValue HalfHi; + SDValue Chain; + if (N->isStrictFPOpcode()) { + HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfLo}); + HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfHi}); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1), + HalfHi.getValue(1)); + } else { + HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); + HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + } // Concatenate them to get the full intermediate truncation result. EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, @@ -2616,6 +2641,17 @@ // type. This should normally be something that ends up being legal directly, // but in theory if a target has very wide vectors and an annoyingly // restricted set of legal types, this split can chain to build things up. + + if (N->isStrictFPOpcode()) { + SDValue Res = DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {OutVT, MVT::Other}, + {Chain, InterVec, + DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))}); + // Relink the chain + ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1)); + return Res; + } + return IsFloat ? 
DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec, DAG.getTargetConstant( @@ -2847,6 +2883,8 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: Res = WidenVecRes_Convert_StrictFP(N); break; @@ -4168,7 +4206,9 @@ case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1101,6 +1101,16 @@ : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); } +SDValue SelectionDAG::getStrictFPExtendOrRound(SDValue Op, const SDLoc &DL, + EVT VT) { + SDValue Chain = SDValue(Op.getNode(),1); + + return VT.bitsGT(Op.getValueType()) + ? getNode(ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, {Chain, Op}) + : getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, Op, getIntPtrConstant(0, DL)}); +} + SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? 
getNode(ISD::ANY_EXTEND, DL, VT, Op) : @@ -7783,6 +7793,8 @@ case ISD::STRICT_FP_EXTEND: NewOpc = ISD::FP_EXTEND; break; case ISD::STRICT_FP_TO_SINT: NewOpc = ISD::FP_TO_SINT; break; case ISD::STRICT_FP_TO_UINT: NewOpc = ISD::FP_TO_UINT; break; + case ISD::STRICT_SINT_TO_FP: NewOpc = ISD::SINT_TO_FP; break; + case ISD::STRICT_UINT_TO_FP: NewOpc = ISD::UINT_TO_FP; break; } assert(Node->getNumValues() == 2 && "Unexpected number of results!"); @@ -9108,7 +9120,8 @@ } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { - assert(N->getNumValues() == 1 && + assert((N->getNumValues() == 1 || + (N->getNumValues() == 2 && N->getValueType(1) == MVT::Other)) && "Can't unroll a vector with multiple results!"); EVT VT = N->getValueType(0); @@ -9118,6 +9131,9 @@ SmallVector Scalars; SmallVector Operands(N->getNumOperands()); + SmallVector EltVTs = {EltVT}; + if (N->getNumValues() == 2) + EltVTs.push_back(MVT::Other); // If ResNE is 0, fully unroll the vector op. if (ResNE == 0) @@ -9144,8 +9160,11 @@ switch (N->getOpcode()) { default: { - Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, - N->getFlags())); + // Use EltVTs here since the node may be chained. Singular EltVT + // is used for cases where we know there is no chain. + SDValue Scalar = getNode(N->getOpcode(), dl, EltVTs, Operands); + Scalar.getNode()->setFlags(N->getFlags()); + Scalars.push_back(Scalar); break; } case ISD::VSELECT: @@ -9169,6 +9188,16 @@ } } + if (N->getNumValues() == 2) { + SmallVector Chains; + for (unsigned i = 0; i < Scalars.size(); i++) { + Chains.push_back(SDValue(Scalars[i].getNode(), 1)); + } + // Build a new factor node to connect the chain back together. 
+ SDValue OutChain = getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + ReplaceAllUsesOfValueWith(SDValue(N, 1), OutChain); + } + for (; i < ResNE; ++i) Scalars.push_back(getUNDEF(EltVT)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6128,6 +6128,8 @@ case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_fptosi: case Intrinsic::experimental_constrained_fptoui: + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: @@ -6941,6 +6943,12 @@ case Intrinsic::experimental_constrained_fptoui: Opcode = ISD::STRICT_FP_TO_UINT; break; + case Intrinsic::experimental_constrained_sitofp: + Opcode = ISD::STRICT_SINT_TO_FP; + break; + case Intrinsic::experimental_constrained_uitofp: + Opcode = ISD::STRICT_UINT_TO_FP; + break; case Intrinsic::experimental_constrained_fptrunc: Opcode = ISD::STRICT_FP_ROUND; Opers.push_back(DAG.getTargetConstant(0, sdl, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -324,7 +324,9 @@ case ISD::STRICT_FP_EXTEND: return "strict_fp_extend"; case ISD::SINT_TO_FP: return "sint_to_fp"; + case ISD::STRICT_SINT_TO_FP: return "strict_sint_to_fp"; case ISD::UINT_TO_FP: return "uint_to_fp"; + case ISD::STRICT_UINT_TO_FP: return "strict_uint_to_fp"; case ISD::FP_TO_SINT: return "fp_to_sint"; case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint"; case ISD::FP_TO_UINT: return "fp_to_uint"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp 
b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6033,8 +6033,10 @@ } bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, + SDValue &Chain, SelectionDAG &DAG) const { - SDValue Src = Node->getOperand(0); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -6065,8 +6067,18 @@ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst); SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr); - SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); - SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + SDValue Slow; + if (Node->isStrictFPOpcode()) { + SDValue SignCvt = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, + { DstVT, MVT::Other }, + { Node->getOperand(0), Or }); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DstVT, MVT::Other }, + { SignCvt.getValue(1), SignCvt, SignCvt }); + Chain = Slow.getValue(1); + } else { + SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); + Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + } // TODO: This really should be implemented using a branch rather than a // select. 
We happen to get lucky and machinesink does the right @@ -6109,8 +6121,18 @@ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); - Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + if (Node->isStrictFPOpcode()) { + SDValue HiSub = + DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); + Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, + {HiSub.getValue(1), LoFlt, HiSub}); + Chain = Result.getValue(1); + } else { + SDValue HiSub = + DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + } return true; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -726,6 +726,8 @@ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Expand); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Expand); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Expand); + setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Expand); + setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Expand); // For most targets @llvm.get.dynamic.area.offset just returns 0. 
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -190,6 +190,8 @@ return false; case Intrinsic::experimental_constrained_fptosi: case Intrinsic::experimental_constrained_fptoui: + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: case Intrinsic::experimental_constrained_sqrt: diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4306,6 +4306,8 @@ case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: case Intrinsic::experimental_constrained_fma: + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: case Intrinsic::experimental_constrained_fptosi: case Intrinsic::experimental_constrained_fptoui: case Intrinsic::experimental_constrained_fptrunc: @@ -4856,6 +4858,33 @@ } break; + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: { + Assert((NumOperands == 3), "invalid arguments for constrained FP intrinsic", + &FPI); + HasExceptionMD = true; + HasRoundingMD = true; + + Value *Operand = FPI.getArgOperand(0); + uint64_t NumSrcElem = 0; + Assert(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic first argument must be integer", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + NumSrcElem = OperandT->getNumElements(); + } + + Operand = &FPI; + Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Assert(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic result must be a floating point", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + 
Assert(NumSrcElem == OperandT->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); + } + } break; + case Intrinsic::experimental_constrained_fptrunc: case Intrinsic::experimental_constrained_fpext: { if (FPI.getIntrinsicID() == Intrinsic::experimental_constrained_fptrunc) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -233,9 +233,11 @@ // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); } else { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand); } @@ -251,6 +253,7 @@ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); @@ -270,6 +273,7 @@ // are Legal, f80 is custom lowered. 
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); @@ -961,9 +965,12 @@ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); @@ -18333,8 +18340,12 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); - SDValue Src = Op.getOperand(0); + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); @@ -18351,7 +18362,16 @@ SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); - SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); + SDValue CvtVec; + if (Op.getNode()->isStrictFPOpcode()) { + CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, + {Op.getOperand(0), InVec}); + // Relink the chain. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1), + SDValue(CvtVec.getNode(), 1)); + } else + CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } @@ -18422,7 +18442,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -18558,6 +18579,7 @@ #endif */ + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -18578,8 +18600,8 @@ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Op.getOperand(0)); + SDValue XR1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -18592,15 +18614,34 @@ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + SDValue Sub; // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + if (Op.getNode()->isStrictFPOpcode()) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), XR2F, CLod1}); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { + // FIXME: Do we need a STRICT version of FHADD? Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + + if (Op.getNode()->isStrictFPOpcode()) { + // Relink the chain that wasn't relinked earlier. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1), + SDValue(Sub.getNode(), 1)); + } } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + if (Op.getNode()->isStrictFPOpcode()) { + Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, + {SDValue(Sub.getNode(), 1), Shuffle, Sub}); + // Relink the chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1), + SDValue(Result.getNode(), 1)); + } else + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, @@ -18795,7 +18836,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); MVT SrcVT = N0.getSimpleValueType(); @@ -18820,6 +18862,14 @@ // Promote i32 to i64 and use a signed conversion on 64-bit targets. if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); + if (Op.getNode()->isStrictFPOpcode()) { + SDValue CvtVec = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, + {DstVT, MVT::Other}, {Op.getOperand(0), N0}); + // Relink the chain. 
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1), + SDValue(CvtVec.getNode(), 1)); + return CvtVec; + } return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); } @@ -18846,7 +18896,7 @@ } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue ValueToStore = Op.getOperand(0); + SDValue ValueToStore = Op.getOperand(OpNo); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come @@ -27714,7 +27764,9 @@ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::STRICT_UINT_TO_FP: case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); @@ -27814,6 +27866,11 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + // TODO: This is only needed because we can't handle replacing a chained + // TODO: node with an unchained one here. Remove when this is fixed. + if (N->isStrictFPOpcode()) + return; + SDValue Res = LowerOperation(SDValue(N, 0), DAG); if (!Res.getNode()) diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -374,7 +374,7 @@ ; CHECK-LABEL: f26 ; COMMON: jmp llrintf -define i64 @f26(float %x) { +define i64 @f26(float %x) #0 { entry: %result = call i64 @llvm.experimental.constrained.llrint.i64.f32(float %x, metadata !"round.dynamic", @@ -418,6 +418,127 @@ ret i64 %result } +; Verify that sitofp(%x) isn't simplified when the rounding mode is +; unknown. The expansion should have only one conversion instruction. 
+; Verify that no gross errors happen. +define double @sifdi(i32 %x) #0 { +; NO-FMA-LABEL: sifdi: +; NO-FMA: cvtsi2sd +; +; HAS-FMA-LABEL: sifdi: +; HAS-FMA: vcvtsi2sd +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffi(i32 %x) #0 { +; NO-FMA-LABEL: siffi: +; NO-FMA: cvtsi2ss +; +; HAS-FMA-LABEL: siffi: +; HAS-FMA: vcvtsi2ss +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double @sifdl(i64 %x) #0 { +; NO-FMA-LABEL: sifdl: +; NO-FMA: cvtsi2sd +; +; HAS-FMA-LABEL: sifdl: +; HAS-FMA: vcvtsi2sd +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffl(i64 %x) #0 { +; NO-FMA-LABEL: siffl: +; NO-FMA: cvtsi2ss +; +; HAS-FMA-LABEL: siffl: +; HAS-FMA: vcvtsi2ss +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that uitofp(%x) isn't simplified when the rounding mode is +; unknown. Expansions from i32 should have only one conversion instruction. +; Verify that no gross errors happen. 
+define double @uifdi(i32 %x) #0 { +; NO-FMA-LABEL: uifdi: +; NO-FMA: cvtsi2sd +; +; HAS-FMA-LABEL: uifdi: +; HAS-FMA: vcvtsi2sd +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @uifdl(i64 %x) #0 { +; NO-FMA-LABEL: uifdl: +; NO-FMA: movq +; NO-FMA-NEXT: punpckldq +; NO-FMA-NEXT: subpd +; NO-FMA-NEXT: movapd +; NO-FMA-NEXT: unpckhpd +; NO-FMA-NEXT: addpd +; +; HAS-FMA-LABEL: uifdl: +; HAS-FMA: vmovq +; HAS-FMA-NEXT: vpunpckldq +; HAS-FMA-NEXT: vsubpd +; HAS-FMA-NEXT: vpermilpd +; HAS-FMA-NEXT: vaddpd +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @uiffi(i32 %x) #0 { +; NO-FMA-LABEL: uiffi: +; NO-FMA: cvtsi2ss %rax, %xmm0 +; +; HAS-FMA-LABEL: uiffi: +; HAS-FMA: vcvtsi2ss %rax, %xmm0, %xmm0 +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @uiffl(i64 %x) #0 { +; NO-FMA-LABEL: uiffl: +; NO-FMA: cvtsi2ss +; NO-FMA: cvtsi2ss +; +; HAS-FMA-LABEL: uiffl: +; HAS-FMA: vcvtsi2ss +; HAS-FMA: vcvtsi2ss +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -452,3 +573,11 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare float 
@llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i64(i64, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -5490,6 +5490,1309 @@ ret <3 x double> %trunc } +define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x 
float> %result +} + +define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + 
@llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + 
%result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, 
%xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %eax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: 
constrained_vector_sitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: cvtsi2sd %rsi, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 +; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +entry: + %result = call <3 x float> + 
@llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: 
constrained_vector_sitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: movaps %xmm3, %xmm1 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, 
%rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: 
constrained_vector_uitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq %rdi, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB170_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB170_1: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; 
CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: testq %rdi, %rdi +; AVX-NEXT: js .LBB170_1 +; AVX-NEXT: # %bb.2: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +; AVX-NEXT: .LBB170_1: +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: shrq %rax +; AVX-NEXT: andl $1, %edi +; AVX-NEXT: orq %rax, %rdi +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; 
CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm4, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: subpd %xmm4, %xmm3 +; CHECK-NEXT: movapd %xmm3, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; CHECK-NEXT: addpd %xmm3, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX-NEXT: 
vsubpd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB174_5 +; CHECK-NEXT: .LBB174_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB174_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_4 +; CHECK-NEXT: .LBB174_5: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss 
%rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB174_1 +; AVX-NEXT: # %bb.2: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: jns .LBB174_5 +; AVX-NEXT: .LBB174_4: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +; AVX-NEXT: .LBB174_1: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB174_4 +; AVX-NEXT: .LBB174_5: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32> %x, + metadata 
!"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm3, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: movq %rdx, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX-NEXT: vsubpd %xmm3, %xmm4, %xmm4 +; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0] +; AVX-NEXT: vaddpd %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vunpcklpd 
{{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: js .LBB178_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: jns .LBB178_5 +; CHECK-NEXT: .LBB178_4: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jns .LBB178_8 +; CHECK-NEXT: .LBB178_7: +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB178_1: +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: orq %rax, %rsi +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB178_4 +; CHECK-NEXT: .LBB178_5: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, 
%rdx +; CHECK-NEXT: js .LBB178_7 +; CHECK-NEXT: .LBB178_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB178_1 +; AVX-NEXT: # %bb.2: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: jns .LBB178_5 +; AVX-NEXT: .LBB178_4: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: jmp .LBB178_6 +; AVX-NEXT: .LBB178_1: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB178_4 +; AVX-NEXT: .LBB178_5: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: .LBB178_6: # %entry +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB178_7 +; AVX-NEXT: # %bb.8: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; AVX-NEXT: .LBB178_7: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64> %x, + 
metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; 
CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: movapd {{.*#+}} xmm5 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm5, %xmm2 +; CHECK-NEXT: movapd %xmm2, %xmm0 
+; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-NEXT: addpd %xmm2, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vmovapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vsubpd %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX-NEXT: vaddpd %xmm1, %xmm5, %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX-NEXT: vsubpd %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_5 +; CHECK-NEXT: .LBB182_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: addss %xmm3, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_8 +; CHECK-NEXT: .LBB182_7: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: jmp .LBB182_9 +; CHECK-NEXT: .LBB182_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: addss %xmm2, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_4 +; CHECK-NEXT: .LBB182_5: # %entry +; 
CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_7 +; CHECK-NEXT: .LBB182_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: .LBB182_9: # %entry +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_10 +; CHECK-NEXT: # %bb.11: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB182_10: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB182_1 +; AVX-NEXT: # %bb.2: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: jns .LBB182_5 +; AVX-NEXT: .LBB182_4: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: jmp .LBB182_6 +; AVX-NEXT: .LBB182_1: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX-NEXT: 
vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB182_4 +; AVX-NEXT: .LBB182_5: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX-NEXT: .LBB182_6: # %entry +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB182_7 +; AVX-NEXT: # %bb.8: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: jns .LBB182_11 +; AVX-NEXT: .LBB182_10: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; AVX-NEXT: .LBB182_7: +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq %rcx +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: testq %rax, %rax +; AVX-NEXT: js .LBB182_10 +; AVX-NEXT: .LBB182_11: # %entry +; AVX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + attributes #0 = { strictfp } ; Single width declarations @@ -5526,6 +6829,14 @@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata) declare <2 x double> 
@llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) ; Scalar width declarations declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata) @@ -5561,6 +6872,14 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> 
@llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) ; Illegal width declarations declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata) @@ -5619,6 +6938,14 @@ declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) ; Double width declarations declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata) @@ -5654,3 +6981,12 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32>, 
metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) + diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll --- a/llvm/test/Feature/fp-intrinsics.ll +++ b/llvm/test/Feature/fp-intrinsics.ll @@ -373,6 +373,28 @@ ret i64 %result } +; Verify that sitofp(42) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: @f30 +; CHECK: call double @llvm.experimental.constrained.sitofp +define double @f30() #0 { +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +; Verify that uitofp(42) isn't simplified when the rounding mode is unknown. 
+; CHECK-LABEL: @f31 +; CHECK: call double @llvm.experimental.constrained.uitofp +define double @f31() #0 { +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -405,3 +427,5 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)