diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18981,43 +18981,21 @@
     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
   }

-  // Legalize to v4i32 type.
-  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
-                   DAG.getUNDEF(MVT::v2i32));
+  // Zero extend to 2i64, OR with the floating point representation of 2^52.
+  // This gives us the floating point equivalent of 2^52 + the i32 integer
+  // since double has 52-bits of mantissa. Then subtract 2^52 in floating
+  // point leaving just our i32 integers in double format.
+  SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
+  SDValue VBias =
+      DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
+  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
+                           DAG.getBitcast(MVT::v2i64, VBias));
+  Or = DAG.getBitcast(MVT::v2f64, Or);

-  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
-  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
-  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
-  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
-
-  // Two to the power of half-word-size.
-  SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
-
-  // Clear upper part of LO, lower HI.
-  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
-  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
-
-  if (IsStrict) {
-    SDValue fHI = DAG.getNode(X86ISD::STRICT_CVTSI2P, DL,
-                              {MVT::v2f64, MVT::Other}, {Op.getOperand(0), HI});
-    fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {MVT::v2f64, MVT::Other},
-                      {fHI.getValue(1), fHI, TWOHW});
-    SDValue fLO = DAG.getNode(X86ISD::STRICT_CVTSI2P, DL,
-                              {MVT::v2f64, MVT::Other}, {Op.getOperand(0), LO});
-    SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                fHI.getValue(1), fLO.getValue(1));
-
-    // Add the two halves
-    return DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v2f64, MVT::Other},
-                       {Chain, fHI, fLO});
-  }
-
-  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
-  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
-  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
-
-  // Add the two halves.
-  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+  if (IsStrict)
+    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
+                       {Op.getOperand(0), Or, VBias});
+  return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
 }

 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -836,49 +836,22 @@
 }

 define <2 x double> @uitofp_v2i32_v2f64(<2 x i32> %x) #0 {
-; SSE-32-LABEL: uitofp_v2i32_v2f64:
-; SSE-32: # %bb.0:
-; SSE-32-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-32-NEXT: pand %xmm0, %xmm1
-; SSE-32-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-32-NEXT: psrld $16, %xmm0
-; SSE-32-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-32-NEXT: mulpd {{\.LCPI.*}}, %xmm0
-; SSE-32-NEXT: addpd %xmm1, %xmm0
-; SSE-32-NEXT: retl
-;
-; SSE-64-LABEL: uitofp_v2i32_v2f64:
-; SSE-64: # %bb.0:
-; SSE-64-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-64-NEXT: pand %xmm0, %xmm1
-; SSE-64-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-64-NEXT: psrld $16, %xmm0
-; SSE-64-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-64-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE-64-NEXT: addpd %xmm1, %xmm0
-; SSE-64-NEXT: retq
-;
-; AVX1-32-LABEL: uitofp_v2i32_v2f64:
-; AVX1-32: # %bb.0:
-; AVX1-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-32-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-32-NEXT: vcvtdq2pd %xmm1, %xmm1
-; AVX1-32-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-32-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX1-32-NEXT: vmulpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX1-32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; AVX1-32-NEXT: retl
+; SSE-LABEL: uitofp_v2i32_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE-NEXT: orpd %xmm1, %xmm0
+; SSE-NEXT: subpd %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
 ;
-; AVX1-64-LABEL: uitofp_v2i32_v2f64:
-; AVX1-64: # %bb.0:
-; AVX1-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-64-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-64-NEXT: vcvtdq2pd %xmm1, %xmm1
-; AVX1-64-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-64-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX1-64-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; AVX1-64-NEXT: retq
+; AVX1-LABEL: uitofp_v2i32_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
 ;
 ; AVX512F-LABEL: uitofp_v2i32_v2f64:
 ; AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -651,35 +651,27 @@
 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; SSE2-LABEL: uitofp_2i32_to_2f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_2i32_to_2f64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_2i32_to_2f64:
 ; VEX: # %bb.0:
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_2i32_to_2f64:
@@ -715,24 +707,19 @@
 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE2-LABEL: uitofp_4i32_to_2f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_4i32_to_2f64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_4i32_to_2f64:
@@ -1003,44 +990,28 @@
 define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE2-LABEL: uitofp_4i32_to_4f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT: mulpd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT: mulpd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT: addpd %xmm5, %xmm1
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: subpd %xmm3, %xmm0
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: subpd %xmm3, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_4i32_to_4f64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT: mulpd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT: mulpd %xmm2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT: addpd %xmm5, %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm3, %xmm2
+; SSE41-NEXT: subpd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: subpd %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_4f64:
@@ -3562,38 +3533,28 @@
 define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; SSE2-LABEL: uitofp_load_2i32_to_2f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_load_2i32_to_2f64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_load_2i32_to_2f64:
 ; VEX: # %bb.0:
-; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
@@ -3629,26 +3590,20 @@
 define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ; SSE2-LABEL: uitofp_load_4i32_to_2f64_2:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_load_4i32_to_2f64_2:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_load_4i32_to_2f64_2:
@@ -3697,26 +3652,20 @@
 define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: subpd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: subpd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
@@ -3944,46 +3893,29 @@
 define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE2-LABEL: uitofp_load_4i32_to_4f64:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT: mulpd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT: mulpd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT: addpd %xmm5, %xmm1
+; SSE2-NEXT: movapd (%rdi), %xmm1
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: subpd %xmm3, %xmm0
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: subpd %xmm3, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: uitofp_load_4i32_to_4f64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT: mulpd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT: mulpd %xmm2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT: addpd %xmm5, %xmm1
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15]
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: subpd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: por %xmm2, %xmm1
+; SSE41-NEXT: subpd %xmm2, %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: uitofp_load_4i32_to_4f64:
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6782,24 +6782,19 @@
 define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 {
 ; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK-NEXT: psrld $16, %xmm0
-; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: addpd %xmm1, %xmm0
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; CHECK-NEXT: orpd %xmm1, %xmm0
+; CHECK-NEXT: subpd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vcvtdq2pd %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX1-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i32:
@@ -7266,22 +7261,15 @@
 define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 {
 ; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrld $16, %xmm1
-; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; CHECK-NEXT: mulpd %xmm2, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: addpd %xmm1, %xmm0
-; CHECK-NEXT: movdqa %xmm3, %xmm1
-; CHECK-NEXT: psrld $16, %xmm1
-; CHECK-NEXT: cvtdq2pd %xmm1, %xmm4
-; CHECK-NEXT: mulpd %xmm2, %xmm4
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm3
-; CHECK-NEXT: cvtdq2pd %xmm3, %xmm1
-; CHECK-NEXT: addpd %xmm4, %xmm1
+; CHECK-NEXT: movapd %xmm0, %xmm1
+; CHECK-NEXT: xorpd %xmm2, %xmm2
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
+; CHECK-NEXT: orpd %xmm3, %xmm0
+; CHECK-NEXT: subpd %xmm3, %xmm0
+; CHECK-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: orpd %xmm3, %xmm1
+; CHECK-NEXT: subpd %xmm3, %xmm1
 ; CHECK-NEXT: retq
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32:
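
Note for reviewers (not part of the patch): the comment block in the X86ISelLowering.cpp hunk describes the trick behind the 4.503599627370496E+15 (2^52) constant that now appears in every updated check line. Below is a minimal scalar C++ sketch of that trick, under the assumption that readers want to see it outside SelectionDAG; the helper names are invented for the illustration and do not exist in LLVM.

// Illustrative only: convert u32 -> double via the 2^52 mantissa-bias trick.
// For 0 <= X < 2^32 < 2^52, OR-ing X into the mantissa of 2^52 yields the
// double value 2^52 + X exactly, so subtracting 2^52 leaves (double)X.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double bitsToDouble(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D)); // reinterpret the bit pattern as a double
  return D;
}

static double uint32ToDouble(uint32_t X) {
  const uint64_t BiasBits = 0x4330000000000000ULL; // bit pattern of 2^52
  double Biased = bitsToDouble(BiasBits | X);      // == 2^52 + X, exact
  return Biased - bitsToDouble(BiasBits);          // == (double)X
}

int main() {
  for (uint32_t X : {0u, 1u, 65536u, 0x80000000u, 0xFFFFFFFFu})
    std::printf("%u -> %.1f (matches cast: %d)\n", X, uint32ToDouble(X),
                uint32ToDouble(X) == static_cast<double>(X));
}

This is exactly the shape of the new lowering: zero extend (vpmovzxdq), OR in the bias (vpor/orpd), bitcast, then one subpd, replacing the old shift/mask, two cvtdq2pd conversions, mulpd, and addpd sequence.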