Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -15552,6 +15552,78 @@
 The result produced is a signed integer converted from the floating
 point operand. The value is truncated, so it is rounded towards zero.
 
+'``llvm.experimental.constrained.uitofp``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <ty2>
+      @llvm.experimental.constrained.uitofp(<type> <value>,
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.uitofp``' intrinsic converts an
+unsigned integer ``value`` to a floating-point of type ``ty2``.
+
+Arguments:
+""""""""""
+
+The first argument to the '``llvm.experimental.constrained.uitofp``'
+intrinsic must be an :ref:`integer <t_integer>` or :ref:`vector
+<t_vector>` of integer values.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+An inexact floating-point exception will be raised if rounding is required.
+Any result produced is a floating point value converted from the input
+integer operand.
+
+'``llvm.experimental.constrained.sitofp``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <ty2>
+      @llvm.experimental.constrained.sitofp(<type> <value>,
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.sitofp``' intrinsic converts a
+signed integer ``value`` to a floating-point of type ``ty2``.
+
+Arguments:
+""""""""""
+
+The first argument to the '``llvm.experimental.constrained.sitofp``'
+intrinsic must be an :ref:`integer <t_integer>` or :ref:`vector
+<t_vector>` of integer values.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+An inexact floating-point exception will be raised if rounding is required.
+Any result produced is a floating point value converted from the input +integer operand. + '``llvm.experimental.constrained.fptrunc``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -310,6 +310,13 @@ STRICT_FP_TO_SINT, STRICT_FP_TO_UINT, + /// STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to + /// a floating point value. These have the same semantics as sitofp and + /// uitofp in IR. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_SINT_TO_FP, + STRICT_UINT_TO_FP, + /// X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating /// point type down to the precision of the destination VT. TRUNC is a /// flag, which is always an integer that is zero or one. If TRUNC is 0, Index: llvm/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAG.h +++ llvm/include/llvm/CodeGen/SelectionDAG.h @@ -811,6 +811,11 @@ /// float type VT, by either extending or rounding (by truncation). SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT); + /// Convert Op, which must be a STRICT operation of float type, to the + /// float type VT, by either extending or rounding (by truncation). + std::pair + getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT); + /// Convert Op, which must be of integer type, to the /// integer type VT, by either any-extending or truncating it. 
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT); Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -4124,13 +4124,15 @@ /// \param N Node to expand /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const; + bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand UINT(i64) to double(f64) conversion /// \param N Node to expand /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise - bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; + bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SDValue &Chain, + SelectionDAG &DAG) const; /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs. 
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const; Index: llvm/include/llvm/IR/ConstrainedOps.def =================================================================== --- llvm/include/llvm/IR/ConstrainedOps.def +++ llvm/include/llvm/IR/ConstrainedOps.def @@ -36,6 +36,8 @@ INSTRUCTION(FDiv, 2, 1, experimental_constrained_fdiv, FDIV) INSTRUCTION(FRem, 2, 1, experimental_constrained_frem, FREM) INSTRUCTION(FPExt, 1, 0, experimental_constrained_fpext, FP_EXTEND) +INSTRUCTION(SIToFP, 1, 1, experimental_constrained_sitofp, SINT_TO_FP) +INSTRUCTION(UIToFP, 1, 1, experimental_constrained_uitofp, UINT_TO_FP) INSTRUCTION(FPToSI, 1, 0, experimental_constrained_fptosi, FP_TO_SINT) INSTRUCTION(FPToUI, 1, 0, experimental_constrained_fptoui, FP_TO_UINT) INSTRUCTION(FPTrunc, 1, 1, experimental_constrained_fptrunc, FP_ROUND) Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -640,6 +640,16 @@ [ llvm_anyfloat_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_sitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + + def int_experimental_constrained_uitofp : Intrinsic<[ llvm_anyfloat_ty ], + [ llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_fptrunc : Intrinsic<[ llvm_anyfloat_ty ], [ llvm_anyfloat_ty, llvm_metadata_ty, Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -172,7 +172,7 @@ SDValue NewIntValue) const; SDValue ExpandFCOPYSIGN(SDNode *Node) const; SDValue ExpandFABS(SDNode *Node) const; - SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, + SDValue ExpandLegalINT_TO_FP(bool isSigned, SDNode *Node, EVT DestVT, const SDLoc &dl); 
SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, const SDLoc &dl); @@ -1016,6 +1016,14 @@ Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + // These pseudo-ops are the same as the other STRICT_ ops except + // they are registered with setOperationAction() using the input type + // instead of the output type. + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()); + break; case ISD::STRICT_LRINT: case ISD::STRICT_LLRINT: case ISD::STRICT_LROUND: @@ -2337,9 +2345,11 @@ /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are /// legal for the target. -SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, +SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDNode *Node, EVT DestVT, const SDLoc &dl) { + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Op0 = Node->getOperand(OpNo); EVT SrcVT = Op0.getValueType(); // TODO: Should any fast-math-flags be set for the created nodes? 
@@ -2387,15 +2397,39 @@ BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // subtract the bias - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); + SDValue Sub; + if (Node->isStrictFPOpcode()) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Node->getOperand(0), Load, Bias}); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias); // final result - SDValue Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); + SDValue OldChain = SDValue(Node, 1); + SDValue Result, NewChain; + if (Node->isStrictFPOpcode()) { + if (!DestVT.bitsEq(Sub.getValueType())) { + std::pair ResultPair; + ResultPair = DAG.getStrictFPExtendOrRound(Sub, OldChain, dl, DestVT); + Result = ResultPair.first; + NewChain = ResultPair.second; + } + else + Result = Sub; + // Finally relink the chain + DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain); + } else + Result = DAG.getFPExtendOrRound(Sub, dl, DestVT); return Result; } assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); // Code below here assumes !isSigned without checking again. 
- SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); + SDValue Tmp1; + if (Node->isStrictFPOpcode()) { + Tmp1 = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other }, + { Node->getOperand(0), Op0 }); + } else + Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT); @@ -2441,6 +2475,14 @@ FudgeInReg = Handle.getValue(); } + if (Node->isStrictFPOpcode()) { + SDValue Result = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other }, + { Tmp1.getValue(1), Tmp1, FudgeInReg }); + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Result.getValue(1)); + return Result; + } + return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg); } @@ -2898,14 +2940,30 @@ break; } case ISD::UINT_TO_FP: - if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) { - Results.push_back(Tmp1); + case ISD::STRICT_UINT_TO_FP: + if (TLI.expandUINT_TO_FP(Node, Tmp1, Tmp2, DAG)) { + if (Node->isStrictFPOpcode()) { + // Relink the chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Node,1), Tmp2); + // Replace the new UINT result. 
+ ReplaceNodeWithValue(SDValue(Node, 0), Tmp1); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_UINT_TO_FP node\n"); + } else + Results.push_back(Tmp1); break; } LLVM_FALLTHROUGH; case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + if (Node->isStrictFPOpcode()) { + Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::STRICT_SINT_TO_FP, + Node, Node->getValueType(0), dl); + ReplaceNode(Node, Tmp1.getNode()); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_xINT_TO_FP node\n"); + return true; + } Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP, - Node->getOperand(0), Node->getValueType(0), dl); + Node, Node->getValueType(0), dl); Results.push_back(Tmp1); break; case ISD::FP_TO_SINT: Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -307,13 +307,18 @@ return TranslateLegalizeResults(Op, Result); TargetLowering::LegalizeAction Action = TargetLowering::Legal; + EVT ValVT; switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Result); #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \ case ISD::STRICT_##DAGN: #include "llvm/IR/ConstrainedOps.def" - Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); + ValVT = Node->getValueType(0); + if (Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP) + ValVT = Node->getOperand(1).getValueType(); + Action = TLI.getOperationAction(Node->getOpcode(), ValVT); // If we're asked to expand a strict vector floating-point operation, // by default we're going to simply unroll it. 
That is usually the // best approach, except in the case where the resulting strict (scalar) @@ -1153,17 +1158,27 @@ } SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { - EVT VT = Op.getOperand(0).getValueType(); + bool IsStrict = Op.getNode()->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + EVT VT = Op.getOperand(OpNo).getValueType(); SDLoc DL(Op); // Attempt to expand using TargetLowering. SDValue Result; - if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG)) + SDValue Chain; + if (TLI.expandUINT_TO_FP(Op.getNode(), Result, Chain, DAG)) { + if (IsStrict) + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Chain); return Result; + } // Make sure that the SINT_TO_FP and SRL instructions are available. - if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand || - TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) + if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, VT) == + TargetLowering::Expand) || + (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, VT) == + TargetLowering::Expand)) || + TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) return DAG.UnrollVectorOp(Op.getNode()); unsigned BW = VT.getScalarSizeInBits(); @@ -1185,6 +1200,29 @@ SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), HalfWordMask); + if (IsStrict) { + // Convert hi and lo to floats + // Convert the hi part back to the upper values + // TODO: Can any fast-math-flags be set on these nodes? 
+ SDValue fHI = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {Op.getOperand(0), HI}); + fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), fHI, TWOHW}); + SDValue fLO = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fHI.getNode(), 1), LO}); + + // Add the two halves + SDValue Result = + DAG.getNode(ISD::STRICT_FADD, DL, {Op.getValueType(), MVT::Other}, + {SDValue(fLO.getNode(), 1), fHI, fLO}); + + // Relink the chain + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), SDValue(Result.getNode(), 1)); + return Result; + } + // Convert hi and lo to floats // Convert the hi part back to the upper values // TODO: Can any fast-math-flags be set on these nodes? Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -587,6 +587,8 @@ case ISD::UINT_TO_FP: Res = ScalarizeVecOp_UnaryOp(N); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: Res = ScalarizeVecOp_UnaryOp_StrictFP(N); @@ -1261,6 +1263,8 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: SplitVecRes_UnaryOp(N, Lo, Hi); return; default: @@ -1977,9 +1981,12 @@ case ISD::VSELECT: Res = SplitVecOp_VSELECT(N, OpNo); break; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) + if (N->getValueType(0).bitsLT( + N->getOperand(N->isStrictFPOpcode() ? 
1 : 0).getValueType())) Res = SplitVecOp_TruncateHelper(N); else Res = SplitVecOp_UnaryOp(N); @@ -2540,7 +2547,8 @@ // // Without this transform, the original truncate would end up being // scalarized, which is pretty much always a last resort. - SDValue InVec = N->getOperand(0); + unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0; + SDValue InVec = N->getOperand(OpNo); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); unsigned NumElements = OutVT.getVectorNumElements(); @@ -2584,8 +2592,23 @@ EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements/2); - SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); - SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + + SDValue HalfLo; + SDValue HalfHi; + SDValue Chain; + if (N->isStrictFPOpcode()) { + HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfLo}); + HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other}, + {N->getOperand(0), HalfHi}); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1), + HalfHi.getValue(1)); + } else { + HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec); + HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec); + } // Concatenate them to get the full intermediate truncation result. EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements); SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo, @@ -2594,6 +2617,17 @@ // type. This should normally be something that ends up being legal directly, // but in theory if a target has very wide vectors and an annoyingly // restricted set of legal types, this split can chain to build things up. 
+ + if (N->isStrictFPOpcode()) { + SDValue Res = DAG.getNode( + ISD::STRICT_FP_ROUND, DL, {OutVT, MVT::Other}, + {Chain, InterVec, + DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))}); + // Relink the chain + ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1)); + return Res; + } + return IsFloat ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec, DAG.getTargetConstant( @@ -3046,6 +3080,8 @@ case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: return WidenVecRes_Convert_StrictFP(N); default: break; @@ -4128,7 +4164,9 @@ case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1113,6 +1113,18 @@ : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL)); } +std::pair +SelectionDAG::getStrictFPExtendOrRound(SDValue Op, SDValue Chain, + const SDLoc &DL, EVT VT) { + SDValue Res = + VT.bitsGT(Op.getValueType()) + ? getNode(ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, {Chain, Op}) + : getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, + {Chain, Op, getIntPtrConstant(0, DL)}); + + return std::pair(Res, SDValue(Res.getNode(), 1)); +} + SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) { return VT.bitsGT(Op.getValueType()) ? getNode(ISD::ANY_EXTEND, DL, VT, Op) : @@ -7322,8 +7334,21 @@ if (VTList.NumVTs == 1) return getNode(Opcode, DL, VTList.VTs[0], Ops); + switch (Opcode) { // FIXME: Can we share all the optimizations with + // single return-value getNode()? + case ISD::STRICT_FP_ROUND: // FIXME: I believe this is IEEE-correct? 
+ EVT VT = VTList.VTs[0]; + SDValue N1 = Ops[1]; + SDValue N2 = Ops[2]; + ConstantSDNode *N2C = dyn_cast(N2); + assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && + VT.bitsLE(N1.getValueType()) && N2C && + (N2C->getZExtValue() == 0 || N2C->getZExtValue() == 1) && + "Invalid STRICT_FP_ROUND!"); + if (N1.getValueType() == VT) + return N1; // noop conversion. + break; #if 0 - switch (Opcode) { // FIXME: figure out how to safely handle things like // int foo(int x) { return 1 << (x & 255); } // int bar() { return foo(256); } @@ -7342,8 +7367,8 @@ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); } break; - } #endif + } // Memoize the node unless it returns a flag. SDNode *N; @@ -9129,7 +9154,8 @@ } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { - assert(N->getNumValues() == 1 && + assert((N->getNumValues() == 1 || + (N->getNumValues() == 2 && N->getValueType(1) == MVT::Other)) && "Can't unroll a vector with multiple results!"); EVT VT = N->getValueType(0); @@ -9139,6 +9165,9 @@ SmallVector Scalars; SmallVector Operands(N->getNumOperands()); + SmallVector EltVTs = {EltVT}; + if (N->getNumValues() == 2) + EltVTs.push_back(MVT::Other); // If ResNE is 0, fully unroll the vector op. if (ResNE == 0) @@ -9165,8 +9194,11 @@ switch (N->getOpcode()) { default: { - Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, - N->getFlags())); + // Use EltVTs here since the node may be chained. Singular EltVT + // is used for cases where we know there is no chain. + SDValue Scalar = getNode(N->getOpcode(), dl, EltVTs, Operands); + Scalar.getNode()->setFlags(N->getFlags()); + Scalars.push_back(Scalar); break; } case ISD::VSELECT: @@ -9190,6 +9222,16 @@ } } + if (N->getNumValues() == 2) { + SmallVector Chains; + for (unsigned i = 0; i < Scalars.size(); i++) { + Chains.push_back(SDValue(Scalars[i].getNode(), 1)); + } + // Build a new factor node to connect the chain back together. 
+ SDValue OutChain = getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + ReplaceAllUsesOfValueWith(SDValue(N, 1), OutChain); + } + for (; i < ResNE; ++i) Scalars.push_back(getUNDEF(EltVT)); Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -324,7 +324,9 @@ case ISD::STRICT_FP_EXTEND: return "strict_fp_extend"; case ISD::SINT_TO_FP: return "sint_to_fp"; + case ISD::STRICT_SINT_TO_FP: return "strict_sint_to_fp"; case ISD::UINT_TO_FP: return "uint_to_fp"; + case ISD::STRICT_UINT_TO_FP: return "strict_uint_to_fp"; case ISD::FP_TO_SINT: return "fp_to_sint"; case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint"; case ISD::FP_TO_UINT: return "fp_to_uint"; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6104,8 +6104,10 @@ } bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, + SDValue &Chain, SelectionDAG &DAG) const { - SDValue Src = Node->getOperand(0); + unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Node->getOperand(OpNo); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -6128,7 +6130,13 @@ // For unsigned conversions, convert them to signed conversions using the // algorithm from the x86_64 __floatundidf in compiler_rt. 
- SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); + SDValue Fast; + if (Node->isStrictFPOpcode()) { + Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), Src}); + Chain = SDValue(Fast.getNode(), 1); + } else + Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT); SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst); @@ -6136,8 +6144,17 @@ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst); SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr); - SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); - SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + SDValue Slow; + if (Node->isStrictFPOpcode()) { + SDValue SignCvt = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, + {DstVT, MVT::Other}, {Chain, Or}); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DstVT, MVT::Other }, + { SignCvt.getValue(1), SignCvt, SignCvt }); + Chain = Slow.getValue(1); + } else { + SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or); + Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt); + } // TODO: This really should be implemented using a branch rather than a // select. 
We happen to get lucky and machinesink does the right @@ -6180,8 +6197,18 @@ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); - Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + if (Node->isStrictFPOpcode()) { + SDValue HiSub = + DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, + {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); + Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, + {HiSub.getValue(1), LoFlt, HiSub}); + Chain = Result.getValue(1); + } else { + SDValue HiSub = + DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + } return true; } Index: llvm/lib/IR/Verifier.cpp =================================================================== --- llvm/lib/IR/Verifier.cpp +++ llvm/lib/IR/Verifier.cpp @@ -4785,6 +4785,30 @@ } break; + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: { + Assert((NumOperands == 3), "invalid arguments for constrained FP intrinsic", + &FPI); + Value *Operand = FPI.getArgOperand(0); + uint64_t NumSrcElem = 0; + Assert(Operand->getType()->isIntOrIntVectorTy(), + "Intrinsic first argument must be integer", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + NumSrcElem = OperandT->getNumElements(); + } + + Operand = &FPI; + Assert((NumSrcElem > 0) == Operand->getType()->isVectorTy(), + "Intrinsic first argument and result disagree on vector use", &FPI); + Assert(Operand->getType()->isFPOrFPVectorTy(), + "Intrinsic result must be a floating point", &FPI); + if (auto *OperandT = dyn_cast(Operand->getType())) { + Assert(NumSrcElem == OperandT->getNumElements(), + "Intrinsic first argument and result vector lengths must be equal", + &FPI); + } + } break; + case Intrinsic::experimental_constrained_fptrunc: case 
Intrinsic::experimental_constrained_fpext: { Value *Operand = FPI.getArgOperand(0); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -233,9 +233,11 @@ // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. @@ -245,9 +247,11 @@ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. @@ -981,9 +985,12 @@ setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. 
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); @@ -18398,8 +18405,12 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); - SDValue Src = Op.getOperand(0); + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); @@ -18416,7 +18427,17 @@ SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); + if (Op.getNode()->isStrictFPOpcode()) { + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, + {Op.getOperand(0), InVec}); + SDValue Chain = SDValue(CvtVec.getNode(), 1); + SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Value, Chain); + } + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } @@ -18487,7 +18508,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 
1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -18496,7 +18518,8 @@ return Extract; if (SrcVT.isVector()) { - if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64 && + !Op.getNode()->isStrictFPOpcode()) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); @@ -18528,7 +18551,7 @@ if (VT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); - SDValue ValueToStore = Op.getOperand(0); + SDValue ValueToStore = Op.getOperand(OpNo); if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come @@ -18630,6 +18653,7 @@ #endif */ + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -18650,8 +18674,8 @@ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Op.getOperand(0)); + SDValue XR1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -18664,15 +18688,33 @@ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + SDValue Sub; + SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? 
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + if (Op.getNode()->isStrictFPOpcode()) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), XR2F, CLod1}); + Chain = SDValue(Sub.getNode(), 1); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { + // FIXME: Do we need a STRICT version of FHADD? Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + if (Op.getNode()->isStrictFPOpcode()) { + Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, + {Chain, Shuffle, Sub}); + Chain = SDValue(Result.getNode(), 1); + } else + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + } + if (Op.getNode()->isStrictFPOpcode()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, Chain); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, @@ -18682,14 +18724,15 @@ /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. - SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - Op.getOperand(0)); + SDValue Load = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); // Zero out the upper parts of the register. 
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); @@ -18709,6 +18752,20 @@ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + if (Op.getNode()->isStrictFPOpcode()) { + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Chain = Op.getOperand(0); + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Chain, Or, Bias}); + + // Handle final rounding. + return DAG + .getStrictFPExtendOrRound(Sub, SDValue(Sub.getNode(), 1), dl, + Op.getSimpleValueType()) + .first; + } + // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -18723,6 +18780,9 @@ if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); + if (Op.getNode()->isStrictFPOpcode()) + return SDValue(); + SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); @@ -18849,7 +18909,8 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); @@ -18867,11 +18928,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + SDValue Chain = + Op.getNode()->isStrictFPOpcode() ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); @@ -18892,6 +18956,10 @@ // Promote i32 to i64 and use a signed conversion on 64-bit targets. 
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); + if (Op.getNode()->isStrictFPOpcode()) { + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Chain, N0}); + } return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); } @@ -18909,8 +18977,8 @@ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); - SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo()); + SDValue Store1 = DAG.getStore(Chain, dl, Op.getOperand(OpNo), StackSlot, + MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); @@ -18918,14 +18986,14 @@ } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue ValueToStore = Op.getOperand(0); + SDValue ValueToStore = Op.getOperand(OpNo); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, - MachinePointerInfo()); + SDValue Store = + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, @@ -18940,13 +19008,14 @@ SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); + Chain = SDValue(Fild.getNode(), 1); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. 
SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), - Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( @@ -18961,11 +19030,19 @@ // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); + Chain = SDValue(Fudge.getNode(), 1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? + if (Op.getNode()->isStrictFPOpcode()) { + SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other}, + {Chain, Fild, Fudge}); + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, + {SDValue(Add.getNode(), 1), Add, DAG.getIntPtrConstant(0, dl)}); + } SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); @@ -27852,7 +27929,9 @@ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::STRICT_UINT_TO_FP: case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); Index: llvm/test/CodeGen/X86/fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/fp-intrinsics.ll +++ llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1799,6 +1799,426 @@ 
ret i64 %result } +; Verify that sitofp(%x) isn't simplified when the rounding mode is +; unknown. The expansion should have only one conversion instruction. +; Verify that no gross errors happen. +define double @sifdi(i32 %x) #0 { +; X87-LABEL: sifdi: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2sd %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdi: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffi(i32 %x) #0 { +; X87-LABEL: siffi: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %eax +; X87-NEXT: .cfi_def_cfa_offset 8 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildl (%esp) +; X87-NEXT: popl %eax +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2ss %edi, %xmm0 +; SSE-NEXT: retq +; +; 
AVX-LABEL: siffi: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define double @sifdl(i64 %x) #0 { +; X87-LABEL: sifdl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: sifdl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fstpl (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: sifdl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2sd %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sifdl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @siffl(i64 %x) #0 { +; X87-LABEL: siffl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: siffl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; 
X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: flds {{[0-9]+}}(%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: siffl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: siffl: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +; Verify that uitofp(%x) isn't simplified when the rounding mode is +; unknown. Expansions from i32 should have only one conversion instruction. +; Verify that no gross errors happen. +define double @uifdi(i32 %x) #0 { +; X87-LABEL: uifdi: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: movl $0, {{[0-9]+}}(%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: orpd %xmm0, %xmm1 +; X86-SSE-NEXT: subsd %xmm0, %xmm1 +; X86-SSE-NEXT: movsd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uifdi: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2sd %rax, 
%xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uifdi: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define double @uifdl(i64 %x) #0 { +; X87-LABEL: uifdl: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 24 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: xorl %eax, %eax +; X87-NEXT: testl %ecx, %ecx +; X87-NEXT: setns %al +; X87-NEXT: fildll (%esp) +; X87-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: addl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uifdl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-SSE-NEXT: subpd {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: movapd %xmm0, %xmm1 +; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; X86-SSE-NEXT: addpd %xmm0, %xmm1 +; X86-SSE-NEXT: movlpd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uifdl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdi, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: subpd {{.*}}(%rip), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uifdl: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uifdl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +define float @uiffi(i32 %x) #0 { +; X87-LABEL: uiffi: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: movl $0, {{[0-9]+}}(%esp) +; X87-NEXT: fildll (%esp) +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffi: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: orpd %xmm0, %xmm1 +; X86-SSE-NEXT: subsd %xmm0, %xmm1 +; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: cvtsd2ss %xmm1, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffi: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uiffi: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uiffi: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + +define float @uiffl(i64 %x) #0 { +; X87-LABEL: uiffl: +; X87: # %bb.0: 
# %entry +; X87-NEXT: subl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 24 +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X87-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X87-NEXT: xorl %eax, %eax +; X87-NEXT: testl %ecx, %ecx +; X87-NEXT: setns %al +; X87-NEXT: fildll {{[0-9]+}}(%esp) +; X87-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X87-NEXT: fstps {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: addl $20, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: uiffl: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: setns %al +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: addl $20, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: uiffl: +; SSE: # %bb.0: # %entry +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: js .LBB42_1 +; SSE-NEXT: # %bb.2: # %entry +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: .LBB42_1: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: andl $1, %edi +; SSE-NEXT: orq %rax, %rdi +; SSE-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: uiffl: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: js .LBB42_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB42_1: +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: vcvtsi2ss %rdi, 
%xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: uiffl: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret float %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -1837,3 +2257,11 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.f32.i64(i64, metadata, metadata) Index: llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -5690,6 +5690,1539 @@ ret <3 x double> %trunc } +define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: 
constrained_vector_sitofp_v1f64_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %edi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + 
+define <2 x double> @constrained_vector_sitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret 
<2 x float> %result +} + +define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata 
!"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %eax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2sd %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %eax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %eax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: movlhps 
{{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: cvtsi2sd %rsi, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 +; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; 
AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + 
metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %eax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f64_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_sitofp_v4f32_v4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: cvtsi2sd %rax, 
%xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: movaps %xmm3, %xmm1 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v4f64_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata 
!"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sitofp_v4f32_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; 
AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32> %x, + 
metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x double> + @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x double> %result +} + +define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB170_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB170_1: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: js .LBB170_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 
+; AVX1-NEXT: retq +; AVX1-NEXT: .LBB170_1: +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: retq +entry: + %result = call <1 x float> + @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> 
@constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm4, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-NEXT: subpd %xmm4, %xmm3 +; CHECK-NEXT: movapd %xmm3, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; CHECK-NEXT: addpd %xmm3, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq +entry: + %result = call <2 x double> + @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %result +} + +define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; 
CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB174_5 +; CHECK-NEXT: .LBB174_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB174_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB174_4 +; CHECK-NEXT: .LBB174_5: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB174_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB174_5 +; AVX1-NEXT: .LBB174_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB174_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax 
+; AVX1-NEXT: js .LBB174_4 +; AVX1-NEXT: .LBB174_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: retq +entry: + %result = call <2 x float> + @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rax, %xmm0 +; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: 
vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps 
{{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm3, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: movq %rdx, %xmm4 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; CHECK-NEXT: subpd %xmm3, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd 
{{.*#+}} xmm4 = xmm2[1,0] +; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0] +; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <3 x double> + @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %result +} + +define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: js .LBB178_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: jns .LBB178_5 +; CHECK-NEXT: .LBB178_4: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: jns .LBB178_8 +; CHECK-NEXT: .LBB178_7: +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB178_1: +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: orq %rax, %rsi +; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB178_4 +; CHECK-NEXT: .LBB178_5: # %entry +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: js .LBB178_7 +; CHECK-NEXT: .LBB178_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB178_5 +; AVX1-NEXT: .LBB178_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB178_6 +; AVX1-NEXT: .LBB178_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_4 +; 
AVX1-NEXT: .LBB178_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: .LBB178_6: # %entry +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB178_7 +; AVX1-NEXT: # %bb.8: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB178_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <3 x float> + @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} 
xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cvtsi2sd %rax, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm2 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + +define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: movapd {{.*#+}} xmm5 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm5, %xmm2 +; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-NEXT: addpd %xmm2, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; CHECK-NEXT: addpd %xmm1, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-NEXT: subpd %xmm5, %xmm4 +; CHECK-NEXT: movapd %xmm4, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; CHECK-NEXT: addpd %xmm4, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: movapd %xmm2, %xmm1 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; AVX1-NEXT: 
vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503599627370496E+15,1.9342813113834067E+25] +; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX1-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX1-NEXT: vaddpd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0] +; AVX1-NEXT: vaddpd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq +entry: + %result = call <4 x double> + @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + 
ret <4 x double> %result +} + +define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_5 +; CHECK-NEXT: .LBB182_4: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: addss %xmm3, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jns .LBB182_8 +; CHECK-NEXT: .LBB182_7: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: jmp .LBB182_9 +; CHECK-NEXT: .LBB182_1: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: cvtsi2ss %rax, %xmm2 +; CHECK-NEXT: addss %xmm2, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_4 +; CHECK-NEXT: .LBB182_5: # %entry +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_7 +; CHECK-NEXT: .LBB182_8: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2ss %rax, %xmm1 +; CHECK-NEXT: .LBB182_9: # %entry +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB182_10 +; CHECK-NEXT: # %bb.11: # %entry +; CHECK-NEXT: 
xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB182_10: +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: addss %xmm0, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq +; +; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_1 +; AVX1-NEXT: # %bb.2: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB182_5 +; AVX1-NEXT: .LBB182_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB182_6 +; AVX1-NEXT: .LBB182_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_4 +; AVX1-NEXT: .LBB182_5: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: .LBB182_6: # %entry +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_7 +; AVX1-NEXT: # %bb.8: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: jns .LBB182_11 +; AVX1-NEXT: .LBB182_10: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB182_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB182_10 +; AVX1-NEXT: .LBB182_11: # %entry +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %result = call <4 x float> + @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x float> %result +} + attributes #0 = { strictfp } ; Single width declarations @@ -5726,6 +7259,14 
@@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) ; Scalar width declarations declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata) @@ -5761,6 +7302,14 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) +declare <1 x double> 
@llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64>, metadata, metadata) ; Illegal width declarations declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata) @@ -5819,6 +7368,14 @@ declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) ; Double width declarations declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata) @@ -5854,3 +7411,12 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, 
metadata, metadata) declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) + Index: llvm/test/Feature/fp-intrinsics.ll =================================================================== --- llvm/test/Feature/fp-intrinsics.ll +++ llvm/test/Feature/fp-intrinsics.ll @@ -373,6 +373,28 @@ ret i64 %result } +; Verify that sitofp(42) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: @f30 +; CHECK: call double @llvm.experimental.constrained.sitofp +define double @f30() #0 { +entry: + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + +; Verify that uitofp(42) isn't simplified when the rounding mode is unknown. 
+; CHECK-LABEL: @f31 +; CHECK: call double @llvm.experimental.constrained.uitofp +define double @f31() #0 { +entry: + %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 42, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -405,3 +427,5 @@ declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)