Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -973,6 +973,7 @@
     setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
     setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
 
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
@@ -1328,6 +1329,8 @@
     }
 
     if (HasInt256) {
+      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
+
       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256-bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -21073,10 +21076,10 @@
   // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
   if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
-      (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+      (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
+      Subtarget.useAVX512Regs()) {
     assert(!IsSigned && "Expected unsigned conversion!");
-    assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
-           "Unexpected features!");
+    assert(!Subtarget.hasVLX() && "Unexpected features!");
     MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
     MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
     // Need to concat with zero vector for strict fp to avoid spurious
@@ -21106,9 +21109,9 @@
   // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
   if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
-      (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
-    assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
-           !Subtarget.hasVLX() && "Unexpected features!");
+      (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
+      Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
+    assert(!Subtarget.hasVLX() && "Unexpected features!");
     MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
     // Need to concat with zero vector for strict fp to avoid spurious
     // exceptions.
@@ -21135,7 +21138,7 @@
     return Res;
   }
 
-  if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+  if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
     if (!Subtarget.hasVLX()) {
       // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
       // legalizer and then widened again by vector op legalization.
@@ -21167,6 +21170,36 @@
     return DAG.getNode(Opc, dl, VT, Tmp);
   }
 
+  // Generate optimized instructions for pre-AVX512 unsigned conversions from
+  // vXf32 to vXi32.
+  if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
+      (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
+    assert(!IsSigned && "Expected unsigned conversion!");
+    assert(Subtarget.hasSSE2() && (VT != MVT::v8i32 || Subtarget.hasAVX2()) &&
+           "Unexpected features!");
+    // We can leverage the specific way the "cvttps2dq" instruction behaves
+    // on out-of-range inputs to generate optimized conversions.
+
+    // Calculate the converted result for values in the range 0 to
+    // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
+    const SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
+    const SDValue Big =
+        DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
+                    DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
+                                DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
+
+    // The "CVTTP2SI" instruction conveniently sets the sign bit if
+    // and only if the value was out of range. So we can use that
+    // as our indicator that we should use "Big" instead of "Small".
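+    //
+    // For example, tracing this selection logic with the input 4.0e9f
+    // (exactly representable as a float):
+    //   Small       = cvttps2dq(4.0e9f)        = 0x80000000 (out of range)
+    //   Big         = cvttps2dq(4.0e9f - 2^31) = 0x6E6B2800 (1852516352)
+    //   IsOverflown = Small >> 31 (arithmetic) = 0xFFFFFFFF
+    //   Small | (Big & IsOverflown)            = 0xEE6B2800 (4000000000)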
+ const SDValue IsOverflown = DAG.getNode(X86ISD::VSRAI, dl, VT, Small, + DAG.getTargetConstant(31, dl, MVT::i8)); + + // Use "Small" if "IsOverflown" has all bits cleared + // and "0x80000000 | Big" if all bits in "IsOverflown" are set. + return DAG.getNode(ISD::OR, dl, VT, Small, + DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown)); + } + return SDValue(); } Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1761,6 +1761,10 @@ { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, + + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 16 }, + { ISD::FP_TO_UINT, MVT::v32i32, MVT::v32f32, 32 } }; static const TypeConversionCostTblEntry AVXConversionTbl[] = { @@ -1845,12 +1849,16 @@ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 }, { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 }, { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 }, + + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 16 }, + { ISD::FP_TO_UINT, MVT::v32i32, MVT::v32f32, 32 }, + // This node is expanded into scalarized operations but BasicTTI is overly // optimistic estimating its cost. It computes 3 per element (one // vector-extract, one scalar conversion and one vector-insert). The // problem is that the inserts form a read-modify-write chain so latency // should be factored in too. Inflating the cost per element by 1. - { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, @@ -1908,6 +1916,7 @@ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 }, { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 }, { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 6 } }; static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { @@ -1953,6 +1962,9 @@ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 }, { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 8 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 6 }, { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, Index: llvm/test/Analysis/CostModel/X86/fptoui.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/fptoui.ll +++ llvm/test/Analysis/CostModel/X86/fptoui.ll @@ -63,9 +63,9 @@ define i32 @fptoui_double_i32(i32 %arg) { ; SSE2-LABEL: 'fptoui_double_i32' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = fptoui <8 x 
double> undef to <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'fptoui_double_i32' @@ -91,9 +91,9 @@ ; ; SLM-LABEL: 'fptoui_double_i32' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I32 = fptoui double undef to i32 @@ -236,25 +236,25 @@ ; SSE2-LABEL: 'fptoui_float_i32' ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'fptoui_float_i32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'fptoui_float_i32' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float 
undef to i32 -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'fptoui_float_i32' @@ -267,10 +267,10 @@ ; ; SLM-LABEL: 'fptoui_float_i32' ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> -; SLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I32 = fptoui float undef to i32 Index: llvm/test/CodeGen/X86/concat-cast.ll =================================================================== --- llvm/test/CodeGen/X86/concat-cast.ll +++ llvm/test/CodeGen/X86/concat-cast.ll @@ -109,91 +109,39 @@ } define <4 x i32> @fptoui_v4f32_v4i32(<2 x float> %x, <2 x float> %y) { -; SSE2-LABEL: fptoui_v4f32_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: cmpltps %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm0, %xmm4 -; SSE2-NEXT: subps %xmm3, %xmm0 -; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: xorps %xmm5, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm4 -; SSE2-NEXT: andnps %xmm0, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: cmpltps %xmm3, %xmm0 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm4 -; SSE2-NEXT: subps %xmm3, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: xorps %xmm5, %xmm1 -; SSE2-NEXT: andps %xmm0, %xmm4 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; 
SSE2-NEXT: retq -; -; SSE4-LABEL: fptoui_v4f32_v4i32: -; SSE4: # %bb.0: -; SSE4-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE4-NEXT: movaps %xmm0, %xmm2 -; SSE4-NEXT: cmpltps %xmm4, %xmm2 -; SSE4-NEXT: cvttps2dq %xmm0, %xmm5 -; SSE4-NEXT: subps %xmm4, %xmm0 -; SSE4-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE4-NEXT: movaps {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE4-NEXT: xorps %xmm6, %xmm3 -; SSE4-NEXT: movaps %xmm2, %xmm0 -; SSE4-NEXT: blendvps %xmm0, %xmm5, %xmm3 -; SSE4-NEXT: movaps %xmm1, %xmm0 -; SSE4-NEXT: cmpltps %xmm4, %xmm0 -; SSE4-NEXT: cvttps2dq %xmm1, %xmm2 -; SSE4-NEXT: subps %xmm4, %xmm1 -; SSE4-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE4-NEXT: xorps %xmm6, %xmm1 -; SSE4-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE4-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE4-NEXT: movaps %xmm3, %xmm0 -; SSE4-NEXT: retq +; SSE-LABEL: fptoui_v4f32_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: subps {{.*}}(%rip), %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_v4f32_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vsubps %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vxorps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vcmpltps %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vsubps %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vxorps %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_v4f32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vcmpltps %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vsubps %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX2-NEXT: vxorps %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_v4f32_v4i32: @@ -349,49 +297,34 @@ ; ; AVX1-LABEL: fptoui_v4f64_v4i32: 
; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm4 -; AVX1-NEXT: vcvttpd2dq %ymm4, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX1-NEXT: vxorpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vsubpd %ymm2, %ymm1, %ymm2 -; AVX1-NEXT: vcvttpd2dq %ymm2, %xmm2 -; AVX1-NEXT: vxorpd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_v4f64_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vcvttpd2dq %ymm4, %xmm4 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vcmpltpd %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vsubpd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vcvttpd2dq %ymm2, %xmm2 -; AVX2-NEXT: vxorpd %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; Index: llvm/test/CodeGen/X86/ftrunc.ll =================================================================== --- llvm/test/CodeGen/X86/ftrunc.ll +++ llvm/test/CodeGen/X86/ftrunc.ll @@ -63,24 +63,20 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpltps %xmm2, %xmm1 -; SSE2-NEXT: cvttps2dq 
%xmm0, %xmm3 -; SSE2-NEXT: subps %xmm2, %xmm0 +; SSE2-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: subps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: xorps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: andps %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535] -; SSE2-NEXT: andps %xmm1, %xmm0 -; SSE2-NEXT: orps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: por {{.*}}(%rip), %xmm1 -; SSE2-NEXT: subps {{.*}}(%rip), %xmm1 -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_unsigned_v4f32: Index: llvm/test/CodeGen/X86/vec_cast3.ll =================================================================== --- llvm/test/CodeGen/X86/vec_cast3.ll +++ llvm/test/CodeGen/X86/vec_cast3.ll @@ -115,13 +115,12 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) { ; CHECK-LABEL: cvt_v2f32_v2u32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; CHECK-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vcvttps2dq %xmm1, %xmm1 -; CHECK-NEXT: vxorps LCPI11_1, %xmm1, %xmm1 +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: vpsrad $31, %xmm1, %xmm2 +; CHECK-NEXT: vsubps LCPI11_0, %xmm0, %xmm0 ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retl %res = fptoui <2 x float> %src to <2 x i32> ret <2 x i32> %res Index: llvm/test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1199,40 +1199,34 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: cmpltps %xmm2, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: subps {{.*}}(%rip), %xmm0 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_2f32_to_2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: 
vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_2f32_to_2i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: @@ -1267,40 +1261,34 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { ; SSE-LABEL: fptoui_4f32_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: cmpltps %xmm2, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: subps {{.*}}(%rip), %xmm0 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f32_to_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: fptoui_4f32_to_4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f32_to_4i32: @@ -1507,28 +1495,21 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) { ; SSE-LABEL: fptoui_8f32_to_8i32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpltps %xmm4, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm4, %xmm0 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: xorps %xmm5, %xmm0 -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: cmpltps %xmm4, %xmm3 -; 
SSE-NEXT: cvttps2dq %xmm1, %xmm0 -; SSE-NEXT: subps %xmm4, %xmm1 -; SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm5, %xmm1 -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: cvttps2dq %xmm1, %xmm3 +; SSE-NEXT: subps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_8f32_to_8i32: @@ -1545,13 +1526,12 @@ ; AVX2-LABEL: fptoui_8f32_to_8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_8f32_to_8i32: Index: llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -81,30 +81,9 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 -; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 -; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 -; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 -; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x 
i32>
+; SSE-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
@@ -135,30 +114,9 @@
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
-; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX512-LABEL: @fptosi_fptoui(
Index: llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -402,70 +402,19 @@
 define void @fptoui_8f32_8i32() #0 {
 ; SSE-LABEL: @fptoui_8f32_8i32(
-; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = 
fptoui float [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; -; AVX256NODQ-LABEL: @fptoui_8f32_8i32( -; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32 -; 
AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32 -; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f32_8i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32> -; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f32_8i32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32> -; AVX256DQ-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 -; AVX256DQ-NEXT: ret void +; AVX-LABEL: @fptoui_8f32_8i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32> +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4