Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -302,6 +302,9 @@ // Vector FP round. VFPROUND, VFPROUND_RND, VFPROUNDS_RND, + // Vector double to signed integer (truncated). + CVTTPD2DQ, + // Vector signed/unsigned integer to double. CVTDQ2PD, CVTUDQ2PD, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -855,8 +855,9 @@ setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); @@ -22237,6 +22238,27 @@ case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + if (IsSigned && N->getValueType(0) == MVT::v2i32) { + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + SDValue Src = N->getOperand(0); + if (Src.getValueType() == MVT::v2f64) { + SDValue Idx = DAG.getIntPtrConstant(0, dl); + SDValue Res = DAG.getNode(X86ISD::CVTTPD2DQ, dl, MVT::v4i32, Src); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); + Results.push_back(Res); + return; + } + if (Src.getValueType() == MVT::v2f32) { + SDValue Idx = DAG.getIntPtrConstant(0, dl); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); + Results.push_back(Res); + return; + } + } + std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = 
Vals.second; @@ -22555,6 +22577,7 @@ case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; + case X86ISD::CVTTPD2DQ: return "X86ISD::CVTTPD2DQ"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -6262,6 +6262,17 @@ VR128X:$src1, sub_xmm)))), sub_ymm)>; } +let Predicates = [HasAVX512, HasVLX] in { + // Select VCVTTPD2DQ for X86cvttpd2dq; the first pattern also absorbs an X86vzmovl wrapped around the result. + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))))), + (VCVTTPD2DQZ128rr VR128X:$src)>; + def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src))), + (VCVTTPD2DQZ128rr VR128X:$src)>; + def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))), + (VCVTTPD2DQZ128rm addr:$src)>; +} + let Predicates = [HasAVX512] in { def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), (VCVTPD2PSZrm addr:$src)>; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -67,6 +67,9 @@ def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; +def X86cvttpd2dq: SDNode<"X86ISD::CVTTPD2DQ", + SDTypeProfile<1, 1, [SDTCisVT<0, v4i32>, + SDTCisVT<1, v2f64>]>>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -2111,6 +2111,14 @@ (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; let 
Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))), + (VCVTTPD2DQrr VR128:$src)>; + def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))), + (VCVTTPD2DQrr VR128:$src)>; + def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))), + (VCVTTPD2DQXrm addr:$src)>; + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), @@ -2128,6 +2136,16 @@ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; +let Predicates = [UseSSE2] in { + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))), + (CVTTPD2DQrr VR128:$src)>; + def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))), + (CVTTPD2DQrr VR128:$src)>; + def : Pat<(v4i32 (X86cvttpd2dq (memopv2f64 addr:$src))), + (CVTTPD2DQrm addr:$src)>; +} // Predicates = [UseSSE2] + // Convert packed single to packed double let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -805,6 +805,8 @@ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, Index: test/Analysis/CostModel/X86/fptosi.ll =================================================================== --- test/Analysis/CostModel/X86/fptosi.ll +++ test/Analysis/CostModel/X86/fptosi.ll @@ -44,20 +44,20 @@ ; AVX2: cost of 1 {{.*}} %I32 = fptosi ; AVX512: cost of 1 {{.*}} %I32 = fptosi %I32 = fptosi double undef to i32 - ; SSE2: cost of 6 {{.*}} %V2I32 = fptosi - ; SSE42: cost of 6 {{.*}} %V2I32 = fptosi - ; AVX1: cost 
of 6 {{.*}} %V2I32 = fptosi - ; AVX2: cost of 6 {{.*}} %V2I32 = fptosi - ; AVX512: cost of 6 {{.*}} %V2I32 = fptosi + ; SSE2: cost of 3 {{.*}} %V2I32 = fptosi + ; SSE42: cost of 3 {{.*}} %V2I32 = fptosi + ; AVX1: cost of 3 {{.*}} %V2I32 = fptosi + ; AVX2: cost of 3 {{.*}} %V2I32 = fptosi + ; AVX512: cost of 3 {{.*}} %V2I32 = fptosi %V2I32 = fptosi <2 x double> undef to <2 x i32> - ; SSE2: cost of 13 {{.*}} %V4I32 = fptosi - ; SSE42: cost of 13 {{.*}} %V4I32 = fptosi + ; SSE2: cost of 7 {{.*}} %V4I32 = fptosi + ; SSE42: cost of 7 {{.*}} %V4I32 = fptosi ; AVX1: cost of 1 {{.*}} %V4I32 = fptosi ; AVX2: cost of 1 {{.*}} %V4I32 = fptosi ; AVX512: cost of 1 {{.*}} %V4I32 = fptosi %V4I32 = fptosi <4 x double> undef to <4 x i32> - ; SSE2: cost of 27 {{.*}} %V8I32 = fptosi - ; SSE42: cost of 27 {{.*}} %V8I32 = fptosi + ; SSE2: cost of 15 {{.*}} %V8I32 = fptosi + ; SSE42: cost of 15 {{.*}} %V8I32 = fptosi ; AVX1: cost of 3 {{.*}} %V8I32 = fptosi ; AVX2: cost of 3 {{.*}} %V8I32 = fptosi ; AVX512: cost of 1 {{.*}} %V8I32 = fptosi Index: test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- test/CodeGen/X86/vec_fp_to_int.ll +++ test/CodeGen/X86/vec_fp_to_int.ll @@ -56,46 +56,18 @@ define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_4i32: ; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,2] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_4i32: ; AVX: # BB#0: -; AVX-NEXT: vcvttsd2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm0, %rax -; 
AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: fptosi_2f64_to_4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vcvttpd2qq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512DQ-NEXT: retq +; AVX512-LABEL: fptosi_2f64_to_4i32: +; AVX512: # BB#0: +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -104,39 +76,21 @@ define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i32: ; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32: ; AVX: # BB#0: -; AVX-NEXT: vcvttsd2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vcvttsd2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vcvttpd2dq 
%xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq ; -; AVX512F-LABEL: fptosi_2f64_to_2i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_2i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vcvttpd2qq %xmm0, %xmm0 -; AVX512DQ-NEXT: retq +; AVX512-LABEL: fptosi_2f64_to_2i32: +; AVX512: # BB#0: +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i32> ret <2 x i32> %cvt } @@ -144,17 +98,8 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) { ; SSE-LABEL: fptosi_4f64_to_2i32: ; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; @@ -259,20 +204,8 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptosi_4f64_to_4i32: ; SSE: # BB#0: -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: 
movd %rax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; @@ -713,33 +646,20 @@ define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i32: ; SSE: # BB#0: -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movd %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f32_to_2i32: ; AVX: # BB#0: -; AVX-NEXT: vcvttss2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-NEXT: vcvttss2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq ; ; AVX512-LABEL: fptosi_2f32_to_2i32: ; AVX512: # BB#0: -; AVX512-NEXT: vcvttss2si %xmm0, %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512-NEXT: vcvttss2si %xmm0, %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %cvt Index: test/Transforms/SLPVectorizer/X86/fptosi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/fptosi.ll +++ 
test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -108,30 +108,12 @@ define void @fptosi_8f64_8i32() #0 { ; SSE-LABEL: @fptosi_8f64_8i32( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i32 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i32 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i32 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i32 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i32 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i32 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i32 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i32 -; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 
[[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @fptosi_8f64_8i32(