diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1013,7 +1013,9 @@
     setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
@@ -1248,6 +1250,7 @@
     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
     setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
     setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
@@ -1775,7 +1778,6 @@
                        Subtarget.hasVLX() ? Legal : Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
                        Subtarget.hasVLX() ? Legal : Custom);
-    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
                        Subtarget.hasVLX() ? Legal : Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
@@ -21215,6 +21217,44 @@
   llvm_unreachable("All 256->128 cases should have been handled above!");
 }
 
+// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instructions
+// behave on out-of-range inputs to generate optimized conversions.
+static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
+                                    SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT SrcVT = Src.getSimpleValueType();
+  unsigned DstBits = VT.getScalarSizeInBits();
+  assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
+
+  // Calculate the converted result for values in the range 0 to
+  // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+  SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
+  SDValue Big =
+      DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
+                  DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
+                              DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
+
+  // The "CVTTP2SI" instruction conveniently sets the sign bit if
+  // and only if the value was out of range. So we can use that
+  // as our indicator that we should use "Big" instead of "Small".
+  //
+  // Use "Small" if "IsOverflown" has all bits cleared
+  // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
+
+  // AVX1 can't use the sign-splat masking for 256-bit vectors - we have to
+  // use the slightly slower blendv select instead.
+  if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
+    SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
+    return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
+  }
+
+  SDValue IsOverflown =
+      DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
+                  DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
+  return DAG.getNode(ISD::OR, dl, VT, Small,
+                     DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
+}
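+// A rough scalar model of the expansion above (names illustrative only; it
+// relies on the truncating converts returning 0x80000000 for out-of-range
+// inputs):
+//   int32_t Small = (int32_t)Src;                 // exact for 0 <= Src < 2^31
+//   int32_t Big = (int32_t)(Src - 2147483648.0f); // exact for 2^31 <= Src < 2^32
+//   uint32_t Res = Small | (Big & (Small >> 31)); // sign-splat of Small picks
+//                                                 // "0x80000000 | Big" on overflow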
+
 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
@@ -21274,10 +21314,10 @@
   // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
   if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
-      (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+      (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
+      Subtarget.useAVX512Regs()) {
     assert(!IsSigned && "Expected unsigned conversion!");
-    assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
-           "Unexpected features!");
+    assert(!Subtarget.hasVLX() && "Unexpected features!");
     MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
     MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
     // Need to concat with zero vector for strict fp to avoid spurious
@@ -21307,9 +21347,9 @@
   // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
   if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
-      (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
-    assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
-           !Subtarget.hasVLX() && "Unexpected features!");
+      (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
+      Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
+    assert(!Subtarget.hasVLX() && "Unexpected features!");
     MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
     // Need to concat with zero vector for strict fp to avoid spurious
     // exceptions.
@@ -21366,6 +21406,15 @@
     return DAG.getNode(Opc, dl, VT, Tmp);
   }
 
+  // Generate optimized instructions for pre-AVX512 unsigned conversions from
+  // vXf32/vXf64 to vXi32.
+  if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
+      (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
+      (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
+    assert(!IsSigned && "Expected unsigned conversion!");
+    return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
+  }
+
   return SDValue();
 }
@@ -21378,6 +21427,39 @@
   if (Subtarget.hasAVX512())
     return Op;
 
+  // We can leverage the specific way the "cvttss2si/cvttsd2si" instructions
+  // behave on out-of-range inputs to generate optimized conversions.
+  if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
+                    (VT == MVT::i64 && Subtarget.is64Bit()))) {
+    unsigned DstBits = VT.getScalarSizeInBits();
+    APInt UIntLimit = APInt::getSignMask(DstBits);
+    SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
+                                      DAG.getConstant(UIntLimit, dl, VT));
+    MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
+
+    // Calculate the converted result for values in the range:
+    // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+    // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
+    SDValue Small =
+        DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
+                    DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
+    SDValue Big = DAG.getNode(
+        X86ISD::CVTTS2SI, dl, VT,
+        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
+                    DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
+
+    // The "CVTTS2SI" instruction conveniently sets the sign bit if
+    // and only if the value was out of range. So we can use that
+    // as our indicator that we should use "Big" instead of "Small".
+    //
+    // Use "Small" if "IsOverflown" has all bits cleared and
+    // "0x80000000 | Big" (i64: "0x8000000000000000 | Big") if all bits
+    // in "IsOverflown" are set.
+    SDValue IsOverflown = DAG.getNode(
+        ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
+    return DAG.getNode(ISD::OR, dl, VT, Small,
+                       DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
+  }
+
+  // Use default expansion for i64.
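+  // (The non-strict i64 case was handled above when 64-bit GPRs are
+  // available; without them there is no cvttss2si/cvttsd2si form that can
+  // produce a 64-bit result, so it still needs the generic expansion.)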
if (VT == MVT::i64) return SDValue(); @@ -30781,12 +30863,19 @@ if (VT == MVT::v2i32) { - assert((IsSigned || Subtarget.hasAVX512()) && - "Can only handle signed conversion without AVX512"); + assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) && + "Strict unsigned conversion requires AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { + if (!IsSigned && !Subtarget.hasAVX512()) { + SDValue Res = + expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget); + Results.push_back(Res); + return; + } + unsigned Opc; if (IsStrict) Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1914,12 +1914,14 @@ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 }, { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 }, { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 7 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 7 }, - { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 4 }, - { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, @@ -2026,10 +2028,11 @@ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 }, { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 }, { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 9 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 9 }, - { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 9 }, - { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 9 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, @@ -2097,15 +2100,15 @@ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 }, { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 }, - { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 5 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 }, - { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 5 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 }, { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 }, { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 }, { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 6 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 3 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, }; static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll --- a/llvm/test/Analysis/CostModel/X86/fptoui.ll +++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll @@ -19,18 
+19,25 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'fptoui_double_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui double undef to i64 -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'fptoui_double_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui double undef to i64 -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'fptoui_double_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'fptoui_double_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = fptoui double undef to i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'fptoui_double_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui double undef to i64 @@ -47,10 +54,10 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'fptoui_double_i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui double undef to i64 -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = fptoui <4 x double> 
undef to <4 x i64> -; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = fptoui double undef to i64 @@ -70,23 +77,23 @@ ; ; SSE42-LABEL: 'fptoui_double_i32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'fptoui_double_i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'fptoui_double_i32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'fptoui_double_i32' @@ -98,9 +105,9 @@ ; ; SLM-LABEL: 'fptoui_double_i32' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I32 = fptoui double undef to i32 @@ -206,20 +213,28 @@ ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'fptoui_float_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui float undef to i64 -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; SSE42-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; SSE42-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'fptoui_float_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui float undef to i64 -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'fptoui_float_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = fptoui <4 x float> 
undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'fptoui_float_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = fptoui float undef to i64 +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'fptoui_float_i64' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptoui float undef to i64 @@ -238,11 +253,11 @@ ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'fptoui_float_i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = fptoui float undef to i64 -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> -; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> -; SLM-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> -; SLM-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = fptoui float undef to i64 @@ -264,26 +279,26 @@ ; ; SSE42-LABEL: 'fptoui_float_i32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'fptoui_float_i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'fptoui_float_i32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'fptoui_float_i32' @@ -296,10 +311,10 @@ ; ; SLM-LABEL: 'fptoui_float_i32' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x 
float> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I32 = fptoui float undef to i32 diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -109,91 +109,39 @@ } define <4 x i32> @fptoui_v4f32_v4i32(<2 x float> %x, <2 x float> %y) { -; SSE2-LABEL: fptoui_v4f32_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpltps %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm0, %xmm4 -; SSE2-NEXT: subps %xmm3, %xmm0 -; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: xorps %xmm5, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm4 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: cmpltps %xmm3, %xmm0 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm4 -; SSE2-NEXT: subps %xmm3, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: xorps %xmm5, %xmm1 -; SSE2-NEXT: andps %xmm0, %xmm4 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE4-LABEL: fptoui_v4f32_v4i32: -; SSE4: # %bb.0: -; SSE4-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE4-NEXT: movaps %xmm0, %xmm2 -; SSE4-NEXT: cmpltps %xmm4, %xmm2 -; SSE4-NEXT: cvttps2dq %xmm0, %xmm5 -; SSE4-NEXT: subps %xmm4, %xmm0 -; SSE4-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE4-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE4-NEXT: xorps %xmm6, %xmm3 -; SSE4-NEXT: movaps %xmm2, %xmm0 -; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE4-NEXT: movaps %xmm1, %xmm0 -; SSE4-NEXT: cmpltps %xmm4, %xmm0 -; SSE4-NEXT: cvttps2dq %xmm1, %xmm2 -; SSE4-NEXT: subps %xmm4, %xmm1 -; SSE4-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE4-NEXT: xorps %xmm6, %xmm1 -; SSE4-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE4-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE4-NEXT: movaps %xmm3, %xmm0 -; SSE4-NEXT: retq +; SSE-LABEL: fptoui_v4f32_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_v4f32_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vsubps %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vxorps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vcmpltps %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vsubps %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vxorps %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; 
AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_v4f32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vcmpltps %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vsubps %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX2-NEXT: vxorps %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_v4f32_v4i32: @@ -316,82 +264,50 @@ } define <4 x i32> @fptoui_v4f64_v4i32(<2 x double> %x, <2 x double> %y) { -; SSE2-LABEL: fptoui_v4f64_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: cvttsd2si %xmm0, %rax -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: cvttsd2si %xmm0, %rcx -; SSE2-NEXT: cvttsd2si %xmm1, %rdx -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: cvttsd2si %xmm1, %rsi -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE4-LABEL: fptoui_v4f64_v4i32: -; SSE4: # %bb.0: -; SSE4-NEXT: cvttsd2si %xmm0, %rax -; SSE4-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE4-NEXT: cvttsd2si %xmm0, %rcx -; SSE4-NEXT: cvttsd2si %xmm1, %rdx -; SSE4-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE4-NEXT: cvttsd2si %xmm1, %rsi -; SSE4-NEXT: movd %eax, %xmm0 -; SSE4-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE4-NEXT: pinsrd $2, %edx, %xmm0 -; SSE4-NEXT: pinsrd $3, %esi, %xmm0 -; SSE4-NEXT: retq +; SSE-LABEL: fptoui_v4f64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movapd {{.*#+}} xmm2 = [2.147483648E+9,2.147483648E+9] +; SSE-NEXT: cvttpd2dq %xmm0, %xmm3 +; SSE-NEXT: subpd %xmm2, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm4 +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm1, %xmm3 +; SSE-NEXT: subpd %xmm2, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_v4f64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX1-NEXT: # kill: def 
$xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm4 -; AVX1-NEXT: vcvttpd2dq %ymm4, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vxorpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vsubpd %ymm2, %ymm1, %ymm2 -; AVX1-NEXT: vcvttpd2dq %ymm2, %xmm2 -; AVX1-NEXT: vxorpd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_v4f64_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vcvttpd2dq %ymm4, %xmm4 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vsubpd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vsubpd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vcvttpd2dq %ymm2, %xmm2 -; AVX2-NEXT: vxorpd %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -369,21 +369,21 @@ ; X86-SSE-LABEL: test_unsigned_i32_f32: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movaps %xmm0, %xmm2 -; X86-SSE-NEXT: subss %xmm1, %xmm2 -; X86-SSE-NEXT: cvttss2si %xmm2, %eax -; X86-SSE-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 -; X86-SSE-NEXT: cvttss2si %xmm0, %ecx -; X86-SSE-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE-NEXT: cmovbel %eax, %ecx -; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: movl %eax, %ecx +; X86-SSE-NEXT: sarl $31, %ecx +; X86-SSE-NEXT: movaps %xmm0, %xmm1 +; X86-SSE-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: cvttss2si %xmm1, %edx +; X86-SSE-NEXT: andl 
%ecx, %edx +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: xorl %ecx, %ecx ; X86-SSE-NEXT: xorps %xmm1, %xmm1 ; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 -; X86-SSE-NEXT: cmovael %ecx, %edx +; X86-SSE-NEXT: cmovael %edx, %ecx ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $-1, %eax -; X86-SSE-NEXT: cmovbel %edx, %eax +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: retl ; ; X64-LABEL: test_unsigned_i32_f32: @@ -636,19 +636,18 @@ ; ; X64-LABEL: test_unsigned_i64_f32: ; X64: # %bb.0: -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movaps %xmm0, %xmm2 -; X64-NEXT: subss %xmm1, %xmm2 -; X64-NEXT: cvttss2si %xmm2, %rax -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: xorq %rax, %rcx ; X64-NEXT: cvttss2si %xmm0, %rax -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: cvttss2si %xmm1, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rax, %rcx +; X64-NEXT: cmovaeq %rdx, %rcx ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movq $-1, %rax ; X64-NEXT: cmovbeq %rcx, %rax @@ -1309,13 +1308,12 @@ ; X86-SSE-NEXT: maxsd %xmm1, %xmm0 ; X86-SSE-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: cvttsd2si %xmm0, %ecx -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE-NEXT: movapd %xmm0, %xmm2 -; X86-SSE-NEXT: subsd %xmm1, %xmm2 -; X86-SSE-NEXT: cvttsd2si %xmm2, %eax -; X86-SSE-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 -; X86-SSE-NEXT: ucomisd %xmm1, %xmm0 -; X86-SSE-NEXT: cmovbl %ecx, %eax +; X86-SSE-NEXT: movl %ecx, %edx +; X86-SSE-NEXT: sarl $31, %edx +; X86-SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE-NEXT: andl %edx, %eax +; X86-SSE-NEXT: orl %ecx, %eax ; X86-SSE-NEXT: retl ; ; X64-LABEL: test_unsigned_i32_f64: @@ -1562,19 +1560,18 @@ ; ; X64-LABEL: test_unsigned_i64_f64: ; X64: # %bb.0: -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: movapd %xmm0, %xmm2 -; X64-NEXT: subsd %xmm1, %xmm2 -; X64-NEXT: cvttsd2si %xmm2, %rax -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: xorq %rax, %rcx ; X64-NEXT: cvttsd2si %xmm0, %rax -; X64-NEXT: ucomisd %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movapd %xmm0, %xmm1 +; X64-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: cvttsd2si %xmm1, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorpd %xmm1, %xmm1 ; X64-NEXT: ucomisd %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rax, %rcx +; X64-NEXT: cmovaeq %rdx, %rcx ; X64-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movq $-1, %rax ; X64-NEXT: cmovbeq %rcx, %rax @@ -2298,21 +2295,21 @@ ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movaps %xmm0, %xmm2 -; X86-SSE-NEXT: subss %xmm1, %xmm2 -; X86-SSE-NEXT: cvttss2si %xmm2, %eax -; X86-SSE-NEXT: xorl $-2147483648, %eax # imm = 0x80000000 -; X86-SSE-NEXT: cvttss2si %xmm0, %ecx -; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 -; X86-SSE-NEXT: cmovael 
%eax, %ecx -; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-NEXT: movl %eax, %ecx +; X86-SSE-NEXT: sarl $31, %ecx +; X86-SSE-NEXT: movaps %xmm0, %xmm1 +; X86-SSE-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: cvttss2si %xmm1, %edx +; X86-SSE-NEXT: andl %ecx, %edx +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: xorl %ecx, %ecx ; X86-SSE-NEXT: xorps %xmm1, %xmm1 ; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 -; X86-SSE-NEXT: cmovael %ecx, %edx +; X86-SSE-NEXT: cmovael %edx, %ecx ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movl $-1, %eax -; X86-SSE-NEXT: cmovbel %edx, %eax +; X86-SSE-NEXT: cmovbel %ecx, %eax ; X86-SSE-NEXT: addl $12, %esp ; X86-SSE-NEXT: retl ; @@ -2589,19 +2586,18 @@ ; X64-NEXT: pushq %rax ; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movaps %xmm0, %xmm2 -; X64-NEXT: subss %xmm1, %xmm2 -; X64-NEXT: cvttss2si %xmm2, %rax -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: xorq %rax, %rcx ; X64-NEXT: cvttss2si %xmm0, %rax -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: cvttss2si %xmm1, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: cmovaeq %rax, %rcx +; X64-NEXT: cmovaeq %rdx, %rcx ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movq $-1, %rax ; X64-NEXT: cmovbeq %rcx, %rax diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll --- a/llvm/test/CodeGen/X86/ftrunc.ll +++ b/llvm/test/CodeGen/X86/ftrunc.ll @@ -29,16 +29,14 @@ define double @trunc_unsigned_f64(double %x) #0 { ; SSE2-LABEL: trunc_unsigned_f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: subsd %xmm1, %xmm2 -; SSE2-NEXT: cvttsd2si %xmm2, %rax -; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE2-NEXT: xorq %rax, %rcx ; SSE2-NEXT: cvttsd2si %xmm0, %rax -; SSE2-NEXT: ucomisd %xmm1, %xmm0 -; SSE2-NEXT: cmovaeq %rcx, %rax -; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: cvttsd2si %xmm0, %rdx +; SSE2-NEXT: andq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: movapd %xmm1, %xmm0 @@ -63,24 +61,20 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpltps %xmm2, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE2-NEXT: subps %xmm2, %xmm0 +; SSE2-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: andps %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = 
[65535,65535,65535,65535] -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_unsigned_v4f32: @@ -104,21 +98,21 @@ ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: subsd %xmm2, %xmm1 ; SSE2-NEXT: cvttsd2si %xmm1, %rax -; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: cvttsd2si %xmm0, %rdx -; SSE2-NEXT: ucomisd %xmm2, %xmm0 -; SSE2-NEXT: cmovaeq %rax, %rdx +; SSE2-NEXT: cvttsd2si %xmm0, %rcx +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: andq %rax, %rdx +; SSE2-NEXT: orq %rcx, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: movapd %xmm0, %xmm3 -; SSE2-NEXT: subsd %xmm2, %xmm3 -; SSE2-NEXT: cvttsd2si %xmm3, %rax -; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: cvttsd2si %xmm0, %rax +; SSE2-NEXT: subsd %xmm2, %xmm0 ; SSE2-NEXT: cvttsd2si %xmm0, %rcx -; SSE2-NEXT: ucomisd %xmm2, %xmm0 -; SSE2-NEXT: cmovaeq %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: andq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295] ; SSE2-NEXT: pand %xmm1, %xmm0 @@ -150,40 +144,41 @@ ; SSE2-NEXT: movapd %xmm1, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE2-NEXT: subsd %xmm3, %xmm1 -; SSE2-NEXT: cvttsd2si %xmm1, %rcx -; SSE2-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: cvttsd2si %xmm2, %rdx -; SSE2-NEXT: ucomisd %xmm3, %xmm2 -; SSE2-NEXT: cmovaeq %rcx, %rdx +; SSE2-NEXT: cvttsd2si %xmm1, %rax +; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: andq %rax, %rdx +; SSE2-NEXT: orq %rcx, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: movapd %xmm2, %xmm4 -; SSE2-NEXT: subsd %xmm3, %xmm4 -; SSE2-NEXT: cvttsd2si %xmm4, %rcx -; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: cvttsd2si %xmm2, %rdx -; SSE2-NEXT: ucomisd %xmm3, %xmm2 -; SSE2-NEXT: cmovaeq %rcx, %rdx +; SSE2-NEXT: cvttsd2si %xmm2, %rax +; SSE2-NEXT: subsd %xmm3, %xmm2 +; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: andq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: movapd %xmm0, %xmm2 ; SSE2-NEXT: subsd %xmm3, %xmm2 -; SSE2-NEXT: cvttsd2si %xmm2, %rcx -; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: cvttsd2si %xmm0, %rdx -; SSE2-NEXT: ucomisd %xmm3, %xmm0 -; SSE2-NEXT: cmovaeq %rcx, %rdx +; SSE2-NEXT: cvttsd2si %xmm2, %rax +; SSE2-NEXT: cvttsd2si %xmm0, %rcx +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: andq %rax, %rdx +; SSE2-NEXT: orq %rcx, %rdx ; SSE2-NEXT: movq 
%rdx, %xmm2 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: movapd %xmm0, %xmm4 -; SSE2-NEXT: subsd %xmm3, %xmm4 -; SSE2-NEXT: cvttsd2si %xmm4, %rcx -; SSE2-NEXT: xorq %rax, %rcx ; SSE2-NEXT: cvttsd2si %xmm0, %rax -; SSE2-NEXT: ucomisd %xmm3, %xmm0 -; SSE2-NEXT: cmovaeq %rcx, %rax -; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: subsd %xmm3, %xmm0 +; SSE2-NEXT: cvttsd2si %xmm0, %rcx +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: andq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295] ; SSE2-NEXT: movdqa %xmm2, %xmm3 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -269,15 +269,13 @@ ; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm2 -; CHECK-LIBCALL-NEXT: subss %xmm1, %xmm2 -; CHECK-LIBCALL-NEXT: cvttss2si %xmm2, %rax -; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; CHECK-LIBCALL-NEXT: xorq %rax, %rcx +; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rcx +; CHECK-LIBCALL-NEXT: movq %rcx, %rdx +; CHECK-LIBCALL-NEXT: sarq $63, %rdx +; CHECK-LIBCALL-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax -; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: cmovaeq %rcx, %rax +; CHECK-LIBCALL-NEXT: andq %rdx, %rax +; CHECK-LIBCALL-NEXT: orq %rcx, %rax ; CHECK-LIBCALL-NEXT: popq %rcx ; CHECK-LIBCALL-NEXT: retq ; @@ -286,14 +284,13 @@ ; BWON-F16C-NEXT: movzwl (%rdi), %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; BWON-F16C-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; BWON-F16C-NEXT: vcvttss2si %xmm2, %rax -; BWON-F16C-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; BWON-F16C-NEXT: xorq %rax, %rcx +; BWON-F16C-NEXT: vcvttss2si %xmm0, %rcx +; BWON-F16C-NEXT: movq %rcx, %rdx +; BWON-F16C-NEXT: sarq $63, %rdx +; BWON-F16C-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax -; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 -; BWON-F16C-NEXT: cmovaeq %rcx, %rax +; BWON-F16C-NEXT: andq %rdx, %rax +; BWON-F16C-NEXT: orq %rcx, %rax ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_fptoui_i64: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -38,31 +38,29 @@ ; X64-AVX512-NEXT: vcvttss2usi %xmm0, %eax ; X64-AVX512-NEXT: retq ; -; X86-SSE3-WIN-LABEL: f_to_u32: -; X86-SSE3-WIN: # %bb.0: -; X86-SSE3-WIN-NEXT: pushl %ebp -; X86-SSE3-WIN-NEXT: movl %esp, %ebp -; X86-SSE3-WIN-NEXT: andl $-8, %esp -; X86-SSE3-WIN-NEXT: subl $8, %esp -; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE3-WIN-NEXT: movss %xmm0, (%esp) -; X86-SSE3-WIN-NEXT: flds (%esp) -; X86-SSE3-WIN-NEXT: fisttpll (%esp) -; X86-SSE3-WIN-NEXT: movl (%esp), %eax -; X86-SSE3-WIN-NEXT: movl %ebp, %esp -; X86-SSE3-WIN-NEXT: popl %ebp -; X86-SSE3-WIN-NEXT: retl +; X86-SSE-WIN-LABEL: f_to_u32: +; X86-SSE-WIN: # %bb.0: +; X86-SSE-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-WIN-NEXT: 
cvttss2si %xmm0, %ecx +; X86-SSE-WIN-NEXT: movl %ecx, %edx +; X86-SSE-WIN-NEXT: sarl $31, %edx +; X86-SSE-WIN-NEXT: subss __real@4f000000, %xmm0 +; X86-SSE-WIN-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-WIN-NEXT: andl %edx, %eax +; X86-SSE-WIN-NEXT: orl %ecx, %eax +; X86-SSE-WIN-NEXT: retl ; -; X86-SSE3-LIN-LABEL: f_to_u32: -; X86-SSE3-LIN: # %bb.0: -; X86-SSE3-LIN-NEXT: subl $12, %esp -; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE3-LIN-NEXT: movss %xmm0, (%esp) -; X86-SSE3-LIN-NEXT: flds (%esp) -; X86-SSE3-LIN-NEXT: fisttpll (%esp) -; X86-SSE3-LIN-NEXT: movl (%esp), %eax -; X86-SSE3-LIN-NEXT: addl $12, %esp -; X86-SSE3-LIN-NEXT: retl +; X86-SSE-LIN-LABEL: f_to_u32: +; X86-SSE-LIN: # %bb.0: +; X86-SSE-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-LIN-NEXT: cvttss2si %xmm0, %ecx +; X86-SSE-LIN-NEXT: movl %ecx, %edx +; X86-SSE-LIN-NEXT: sarl $31, %edx +; X86-SSE-LIN-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-LIN-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-LIN-NEXT: andl %edx, %eax +; X86-SSE-LIN-NEXT: orl %ecx, %eax +; X86-SSE-LIN-NEXT: retl ; ; X64-SSE-LABEL: f_to_u32: ; X64-SSE: # %bb.0: @@ -70,32 +68,6 @@ ; X64-SSE-NEXT: # kill: def $eax killed $eax killed $rax ; X64-SSE-NEXT: retq ; -; X86-SSE2-LABEL: f_to_u32: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movaps %xmm0, %xmm2 -; X86-SSE2-NEXT: subss %xmm1, %xmm2 -; X86-SSE2-NEXT: cvttss2si %xmm2, %ecx -; X86-SSE2-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 -; X86-SSE2-NEXT: cvttss2si %xmm0, %eax -; X86-SSE2-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE2-NEXT: cmovbel %ecx, %eax -; X86-SSE2-NEXT: retl -; -; X86-SSE1-LABEL: f_to_u32: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movaps %xmm0, %xmm2 -; X86-SSE1-NEXT: subss %xmm1, %xmm2 -; X86-SSE1-NEXT: cvttss2si %xmm2, %ecx -; X86-SSE1-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 -; X86-SSE1-NEXT: cvttss2si %xmm0, %eax -; X86-SSE1-NEXT: ucomiss %xmm0, %xmm1 -; X86-SSE1-NEXT: cmovbel %ecx, %eax -; X86-SSE1-NEXT: retl -; ; X87-WIN-LABEL: f_to_u32: ; X87-WIN: # %bb.0: ; X87-WIN-NEXT: pushl %ebp @@ -185,28 +157,26 @@ ; ; X86-SSE3-WIN-LABEL: d_to_u32: ; X86-SSE3-WIN: # %bb.0: -; X86-SSE3-WIN-NEXT: pushl %ebp -; X86-SSE3-WIN-NEXT: movl %esp, %ebp -; X86-SSE3-WIN-NEXT: andl $-8, %esp -; X86-SSE3-WIN-NEXT: subl $8, %esp ; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE3-WIN-NEXT: movsd %xmm0, (%esp) -; X86-SSE3-WIN-NEXT: fldl (%esp) -; X86-SSE3-WIN-NEXT: fisttpll (%esp) -; X86-SSE3-WIN-NEXT: movl (%esp), %eax -; X86-SSE3-WIN-NEXT: movl %ebp, %esp -; X86-SSE3-WIN-NEXT: popl %ebp +; X86-SSE3-WIN-NEXT: cvttsd2si %xmm0, %ecx +; X86-SSE3-WIN-NEXT: movl %ecx, %edx +; X86-SSE3-WIN-NEXT: sarl $31, %edx +; X86-SSE3-WIN-NEXT: subsd __real@41e0000000000000, %xmm0 +; X86-SSE3-WIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE3-WIN-NEXT: andl %edx, %eax +; X86-SSE3-WIN-NEXT: orl %ecx, %eax ; X86-SSE3-WIN-NEXT: retl ; ; X86-SSE3-LIN-LABEL: d_to_u32: ; X86-SSE3-LIN: # %bb.0: -; X86-SSE3-LIN-NEXT: subl $12, %esp ; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE3-LIN-NEXT: movsd %xmm0, (%esp) -; X86-SSE3-LIN-NEXT: fldl (%esp) -; X86-SSE3-LIN-NEXT: fisttpll (%esp) -; X86-SSE3-LIN-NEXT: movl (%esp), %eax -; X86-SSE3-LIN-NEXT: addl $12, %esp +; X86-SSE3-LIN-NEXT: cvttsd2si %xmm0, %ecx +; X86-SSE3-LIN-NEXT: 
movl %ecx, %edx +; X86-SSE3-LIN-NEXT: sarl $31, %edx +; X86-SSE3-LIN-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE3-LIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE3-LIN-NEXT: andl %edx, %eax +; X86-SSE3-LIN-NEXT: orl %ecx, %eax ; X86-SSE3-LIN-NEXT: retl ; ; X64-SSE-LABEL: d_to_u32: @@ -215,18 +185,29 @@ ; X64-SSE-NEXT: # kill: def $eax killed $eax killed $rax ; X64-SSE-NEXT: retq ; -; X86-SSE2-LABEL: d_to_u32: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE2-NEXT: movapd %xmm0, %xmm2 -; X86-SSE2-NEXT: subsd %xmm1, %xmm2 -; X86-SSE2-NEXT: cvttsd2si %xmm2, %ecx -; X86-SSE2-NEXT: xorl $-2147483648, %ecx # imm = 0x80000000 -; X86-SSE2-NEXT: cvttsd2si %xmm0, %eax -; X86-SSE2-NEXT: ucomisd %xmm0, %xmm1 -; X86-SSE2-NEXT: cmovbel %ecx, %eax -; X86-SSE2-NEXT: retl +; X86-SSE2-WIN-LABEL: d_to_u32: +; X86-SSE2-WIN: # %bb.0: +; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-WIN-NEXT: cvttsd2si %xmm0, %ecx +; X86-SSE2-WIN-NEXT: movl %ecx, %edx +; X86-SSE2-WIN-NEXT: sarl $31, %edx +; X86-SSE2-WIN-NEXT: subsd __real@41e0000000000000, %xmm0 +; X86-SSE2-WIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE2-WIN-NEXT: andl %edx, %eax +; X86-SSE2-WIN-NEXT: orl %ecx, %eax +; X86-SSE2-WIN-NEXT: retl +; +; X86-SSE2-LIN-LABEL: d_to_u32: +; X86-SSE2-LIN: # %bb.0: +; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-LIN-NEXT: cvttsd2si %xmm0, %ecx +; X86-SSE2-LIN-NEXT: movl %ecx, %edx +; X86-SSE2-LIN-NEXT: sarl $31, %edx +; X86-SSE2-LIN-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-LIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE2-LIN-NEXT: andl %edx, %eax +; X86-SSE2-LIN-NEXT: orl %ecx, %eax +; X86-SSE2-LIN-NEXT: retl ; ; X86-SSE1-WIN-LABEL: d_to_u32: ; X86-SSE1-WIN: # %bb.0: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -153,18 +153,27 @@ ; X86-SSE3-LIN-NEXT: addl $12, %esp ; X86-SSE3-LIN-NEXT: retl ; -; X64-SSE-LABEL: f_to_u64: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movaps %xmm0, %xmm2 -; X64-SSE-NEXT: subss %xmm1, %xmm2 -; X64-SSE-NEXT: cvttss2si %xmm2, %rax -; X64-SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-SSE-NEXT: xorq %rax, %rcx -; X64-SSE-NEXT: cvttss2si %xmm0, %rax -; X64-SSE-NEXT: ucomiss %xmm1, %xmm0 -; X64-SSE-NEXT: cmovaeq %rcx, %rax -; X64-SSE-NEXT: retq +; X64-SSE-WIN-LABEL: f_to_u64: +; X64-SSE-WIN: # %bb.0: +; X64-SSE-WIN-NEXT: cvttss2si %xmm0, %rcx +; X64-SSE-WIN-NEXT: movq %rcx, %rdx +; X64-SSE-WIN-NEXT: sarq $63, %rdx +; X64-SSE-WIN-NEXT: subss __real@5f000000(%rip), %xmm0 +; X64-SSE-WIN-NEXT: cvttss2si %xmm0, %rax +; X64-SSE-WIN-NEXT: andq %rdx, %rax +; X64-SSE-WIN-NEXT: orq %rcx, %rax +; X64-SSE-WIN-NEXT: retq +; +; X64-SSE-LIN-LABEL: f_to_u64: +; X64-SSE-LIN: # %bb.0: +; X64-SSE-LIN-NEXT: cvttss2si %xmm0, %rcx +; X64-SSE-LIN-NEXT: movq %rcx, %rdx +; X64-SSE-LIN-NEXT: sarq $63, %rdx +; X64-SSE-LIN-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-LIN-NEXT: cvttss2si %xmm0, %rax +; X64-SSE-LIN-NEXT: andq %rdx, %rax +; X64-SSE-LIN-NEXT: orq %rcx, %rax +; X64-SSE-LIN-NEXT: retq ; ; X86-SSE2-WIN-LABEL: f_to_u64: ; X86-SSE2-WIN: # %bb.0: @@ -577,18 +586,27 @@ ; X86-SSE3-LIN-NEXT: addl $12, %esp ; X86-SSE3-LIN-NEXT: retl ; -; X64-SSE-LABEL: d_to_u64: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; 
X64-SSE-NEXT: movapd %xmm0, %xmm2 -; X64-SSE-NEXT: subsd %xmm1, %xmm2 -; X64-SSE-NEXT: cvttsd2si %xmm2, %rax -; X64-SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-SSE-NEXT: xorq %rax, %rcx -; X64-SSE-NEXT: cvttsd2si %xmm0, %rax -; X64-SSE-NEXT: ucomisd %xmm1, %xmm0 -; X64-SSE-NEXT: cmovaeq %rcx, %rax -; X64-SSE-NEXT: retq +; X64-SSE-WIN-LABEL: d_to_u64: +; X64-SSE-WIN: # %bb.0: +; X64-SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx +; X64-SSE-WIN-NEXT: movq %rcx, %rdx +; X64-SSE-WIN-NEXT: sarq $63, %rdx +; X64-SSE-WIN-NEXT: subsd __real@43e0000000000000(%rip), %xmm0 +; X64-SSE-WIN-NEXT: cvttsd2si %xmm0, %rax +; X64-SSE-WIN-NEXT: andq %rdx, %rax +; X64-SSE-WIN-NEXT: orq %rcx, %rax +; X64-SSE-WIN-NEXT: retq +; +; X64-SSE-LIN-LABEL: d_to_u64: +; X64-SSE-LIN: # %bb.0: +; X64-SSE-LIN-NEXT: cvttsd2si %xmm0, %rcx +; X64-SSE-LIN-NEXT: movq %rcx, %rdx +; X64-SSE-LIN-NEXT: sarq $63, %rdx +; X64-SSE-LIN-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-LIN-NEXT: cvttsd2si %xmm0, %rax +; X64-SSE-LIN-NEXT: andq %rdx, %rax +; X64-SSE-LIN-NEXT: orq %rcx, %rax +; X64-SSE-LIN-NEXT: retq ; ; X86-SSE2-WIN-LABEL: d_to_u64: ; X86-SSE2-WIN: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -117,13 +117,12 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) { ; CHECK-LABEL: cvt_v2f32_v2u32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; CHECK-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vcvttps2dq %xmm1, %xmm1 -; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: vpsrad $31, %xmm1, %xmm2 +; CHECK-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retl %res = fptoui <2 x float> %src to <2 x i32> ret <2 x i32> %res diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -254,21 +254,21 @@ ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: subsd %xmm2, %xmm1 ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: subsd %xmm2, %xmm0 ; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -278,20 +278,21 @@ ; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; VEX-NEXT: vsubsd %xmm1, 
%xmm0, %xmm2 ; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vcvttsd2si %xmm0, %rcx +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx ; VEX-NEXT: vmovq %rdx, %xmm2 ; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 +; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx +; VEX-NEXT: vmovq %rdx, %xmm0 ; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; VEX-NEXT: retq ; @@ -334,45 +335,24 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd %xmm2, %xmm0 +; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_2f64_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f64_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: fptoui_2f64_to_4i32: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %xmm0, %xmm1 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 +; VEX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_4i32: ; AVX512F: # %bb.0: @@ -407,45 +387,24 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32: ; SSE: # %bb.0: -; 
SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd %xmm2, %xmm0 +; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_2f64_to_2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f64_to_2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: fptoui_2f64_to_2i32: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %xmm0, %xmm1 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 +; VEX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i32: ; AVX512F: # %bb.0: @@ -480,27 +439,24 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: andpd %xmm2, %xmm0 +; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f64_to_2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovapd %xmm0, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; 
AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -508,15 +464,12 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovapd %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vorpd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -560,40 +513,41 @@ ; SSE-NEXT: movapd %xmm0, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: subsd %xmm3, %xmm0 -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm2, %rdx -; SSE-NEXT: ucomisd %xmm3, %xmm2 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd %xmm2, %xmm4 -; SSE-NEXT: subsd %xmm3, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm2, %rdx -; SSE-NEXT: ucomisd %xmm3, %xmm2 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttsd2si %xmm2, %rax +; SSE-NEXT: subsd %xmm3, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movapd %xmm1, %xmm2 ; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm3, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttsd2si %xmm2, %rax +; SSE-NEXT: cvttsd2si %xmm1, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm3, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: ucomisd %xmm3, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: subsd %xmm3, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq @@ -604,36 +558,39 @@ ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vcvttsd2si %xmm3, %rax -; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm2, %rdx -; AVX1-NEXT: vucomisd %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq 
%rax, %rdx +; AVX1-NEXT: vcvttsd2si %xmm2, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 ; AVX1-NEXT: vcvttsd2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm2, %rdx -; AVX1-NEXT: vucomisd %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttsd2si %xmm2, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vcvttsd2si %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm0, %rdx -; AVX1-NEXT: vucomisd %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vcvttsd2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttsd2si %xmm1, %rax ; AVX1-NEXT: vcvttsd2si %xmm0, %rcx -; AVX1-NEXT: vucomisd %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -644,36 +601,39 @@ ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttsd2si %xmm3, %rax -; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm2, %rdx -; AVX2-NEXT: vucomisd %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttsd2si %xmm2, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vcvttsd2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm2, %rdx -; AVX2-NEXT: vucomisd %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttsd2si %xmm2, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 ; AVX2-NEXT: vcvttsd2si %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm0, %rdx -; AVX2-NEXT: vucomisd %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttsd2si %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vcvttsd2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttsd2si %xmm1, %rax ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx -; AVX2-NEXT: vucomisd %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rcx -; AVX2-NEXT: vmovq 
%rcx, %xmm0 +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -732,48 +692,44 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movapd {{.*#+}} xmm2 = [2.147483648E+9,2.147483648E+9] +; SSE-NEXT: cvttpd2dq %xmm1, %xmm3 +; SSE-NEXT: subpd %xmm2, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: movapd %xmm3, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: subpd %xmm2, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm2 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f64_to_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_4f64_to_4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vorpd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1199,40 +1155,34 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: cmpltps %xmm2, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, 
%xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_2f32_to_2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_2f32_to_2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: @@ -1267,40 +1217,34 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { ; SSE-LABEL: fptoui_4f32_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: cmpltps %xmm2, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f32_to_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_4f32_to_4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = 
[2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f32_to_4i32: @@ -1339,21 +1283,21 @@ ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: subss %xmm2, %xmm1 ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: subss %xmm2, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1363,20 +1307,21 @@ ; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 ; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx ; VEX-NEXT: vmovq %rdx, %xmm2 ; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx +; VEX-NEXT: vmovq %rdx, %xmm0 ; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; VEX-NEXT: retq ; @@ -1424,21 +1369,21 @@ ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: subss %xmm2, %xmm1 ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: subss %xmm2, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; 
SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1449,18 +1394,19 @@ ; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3 ; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm1, %rdx -; VEX-NEXT: vucomiss %xmm2, %xmm1 -; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vcvttss2si %xmm1, %rcx +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx ; VEX-NEXT: vsubss %xmm2, %xmm0, %xmm1 ; VEX-NEXT: vcvttss2si %xmm1, %rax -; VEX-NEXT: xorq %rcx, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm2, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: movq %rcx, %rsi +; VEX-NEXT: sarq $63, %rsi +; VEX-NEXT: andq %rax, %rsi +; VEX-NEXT: orq %rcx, %rsi +; VEX-NEXT: vmovq %rsi, %xmm0 ; VEX-NEXT: vmovq %rdx, %xmm1 ; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq @@ -1507,51 +1453,41 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) { ; SSE-LABEL: fptoui_8f32_to_8i32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpltps %xmm4, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm4, %xmm0 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: xorps %xmm5, %xmm0 -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: cmpltps %xmm4, %xmm3 -; SSE-NEXT: cvttps2dq %xmm1, %xmm0 -; SSE-NEXT: subps %xmm4, %xmm1 -; SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm5, %xmm1 -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: cvttps2dq %xmm1, %xmm3 +; SSE-NEXT: subps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_8f32_to_8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm1 +; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vblendvps %ymm1, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_8f32_to_8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = 
[2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_8f32_to_8i32: @@ -1587,43 +1523,43 @@ ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: subss %xmm1, %xmm2 -; SSE-NEXT: cvttss2si %xmm2, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: subss %xmm1, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: subss %xmm1, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1634,36 +1570,39 @@ ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm2, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttss2si %xmm2, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq 
%rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm3, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm3 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttss2si %xmm3, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm0, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1674,36 +1613,39 @@ ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm2, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttss2si %xmm2, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm3, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm3 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttss2si %xmm3, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm0, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; 
AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1765,43 +1707,43 @@ ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: subss %xmm1, %xmm2 -; SSE-NEXT: cvttss2si %xmm2, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: subss %xmm1, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: subss %xmm1, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1812,36 +1754,39 @@ ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm2, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttss2si %xmm2, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm3, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm3 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttss2si %xmm3, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; 
AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm0, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: andq %rax, %rdx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1852,36 +1797,39 @@ ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm2, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttss2si %xmm2, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 ; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm3, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm3 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttss2si %xmm3, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm0, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: andq %rax, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2807,21 +2755,21 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: subss %xmm2, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm1, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm1 -; SSE-NEXT: 
cmovaeq %rax, %rdx +; SSE-NEXT: cvttss2si %xmm1, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: subss %xmm2, %xmm1 ; SSE-NEXT: cvttss2si %xmm1, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; @@ -2831,20 +2779,21 @@ ; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 ; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx ; VEX-NEXT: vmovq %rdx, %xmm2 ; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: movq %rcx, %rdx +; VEX-NEXT: sarq $63, %rdx +; VEX-NEXT: andq %rax, %rdx +; VEX-NEXT: orq %rcx, %rdx +; VEX-NEXT: vmovq %rdx, %xmm0 ; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; VEX-NEXT: retq ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,XOP -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX256DQ +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ +; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256DQ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -126,50 +126,11 @@ ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; -; AVX1-LABEL: @fptoui_8f64_8i32( -; AVX1-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX1-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX1-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX1-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX1-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX1-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX1-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX1-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX1-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32 -; AVX1-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32 -; AVX1-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32 -; AVX1-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32 -; AVX1-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32 -; AVX1-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32 -; AVX1-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32 -; AVX1-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32 -; AVX1-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 -; AVX1-NEXT: ret void -; -; XOP-LABEL: @fptoui_8f64_8i32( -; XOP-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> -; XOP-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; XOP-NEXT: ret void -; -; AVX2-LABEL: @fptoui_8f64_8i32( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* 
@src64 to <8 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> -; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX2-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f64_8i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> -; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX512-NEXT: ret void +; AVX-LABEL: @fptoui_8f64_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32> +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8