Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -852,6 +852,7 @@
     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
 
     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
@@ -1203,7 +1204,6 @@
     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1, Custom);
@@ -14068,6 +14068,41 @@
   return Sub;
 }
 
+static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget, SDLoc &DL) {
+  if (Op.getSimpleValueType() != MVT::v2f64)
+    return SDValue();
+
+  SDValue N0 = Op.getOperand(0);
+  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
+
+  // Legalize to v4i32 type.
+  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+                   DAG.getUNDEF(MVT::v2i32));
+
+  if (Subtarget.hasAVX512())
+    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+
+  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
+  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
+  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
+  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
+
+  // Two to the power of half-word-size.
+  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+
+  // Clear upper part of LO, lower HI.
+  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
+  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
+
+  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
+  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
+  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+
+  // Add the two halves.
+  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+}
+
 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
   // The algorithm is the following:
@@ -14181,13 +14216,6 @@
   switch (SrcVT.SimpleTy) {
   default:
     llvm_unreachable("Custom UINT_TO_FP is not supported!");
-  case MVT::v2i32: {
-    if (VT == MVT::v2f64)
-      return DAG.getNode(X86ISD::CVTUI2P, dl, VT,
-                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, N0,
-                                     DAG.getUNDEF(SrcVT)));
-    return SDValue();
-  }
   case MVT::v4i8:
   case MVT::v4i16:
   case MVT::v8i8:
@@ -14196,6 +14224,8 @@
     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
   }
+  case MVT::v2i32:
+    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
   case MVT::v4i32:
   case MVT::v8i32:
     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
Index: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
@@ -511,33 +511,24 @@
 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; SSE-LABEL: uitofp_2i32_to_2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT:    subpd %xmm3, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT:    addpd %xmm4, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    subpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT:    addpd %xmm2, %xmm1
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: uitofp_2i32_to_2f64:
 ; VEX:       # BB#0:
-; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; VEX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm2, %xmm0
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i32_to_2f64:
@@ -571,20 +562,13 @@
 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE-LABEL: uitofp_4i32_to_2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT:    subpd %xmm3, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT:    addpd %xmm4, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    subpd %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT:    addpd %xmm2, %xmm1
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_2f64:
@@ -868,32 +852,23 @@
 define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE-LABEL: uitofp_4i32_to_4f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT:    movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04]
+; SSE-NEXT:    mulpd %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT:    movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT:    subpd %xmm5, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; SSE-NEXT:    addpd %xmm6, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT:    subpd %xmm5, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
-; SSE-NEXT:    addpd %xmm4, %xmm6
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT:    subpd %xmm5, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT:    addpd %xmm2, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT:    subpd %xmm5, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
-; SSE-NEXT:    addpd %xmm4, %xmm2
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE-NEXT:    mulpd %xmm2, %xmm5
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE-NEXT:    addpd %xmm5, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_4f64:
@@ -2975,34 +2950,26 @@
 define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; SSE-LABEL: uitofp_load_2i32_to_2f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT:    movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT:    subpd %xmm4, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    mulpd {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    addpd %xmm1, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT:    subpd %xmm4, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE-NEXT:    addpd %xmm3, %xmm1
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; VEX-LABEL: uitofp_load_2i32_to_2f64:
 ; VEX:       # BB#0:
-; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; VEX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
-; VEX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm2, %xmm0
+; VEX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
@@ -3266,33 +3233,24 @@
 define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE-LABEL: uitofp_load_4i32_to_4f64:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa (%rdi), %xmm2
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT:    movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
-; SSE-NEXT:    subpd %xmm6, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE-NEXT:    addpd %xmm3, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE-NEXT:    subpd %xmm6, %xmm5
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
-; SSE-NEXT:    addpd %xmm5, %xmm3
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT:    subpd %xmm6, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
-; SSE-NEXT:    addpd %xmm2, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT:    subpd %xmm6, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE-NEXT:    addpd %xmm3, %xmm2
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT:    movapd {{.*#+}} xmm2 = [6.553600e+04,6.553600e+04]
+; SSE-NEXT:    mulpd %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    cvtdq2pd %xmm1, %xmm5
+; SSE-NEXT:    mulpd %xmm2, %xmm5
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    cvtdq2pd %xmm4, %xmm1
+; SSE-NEXT:    addpd %xmm5, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: uitofp_load_4i32_to_4f64: