// Review notes (documentation only; every original byte of the patch below is
// unchanged).
//
// NOTE(review): this patch text looks as if its line breaks were collapsed into
// a few very long lines -- confirm against the original diff before applying.
//
// Purpose: adds two file-local (static) helpers to
// lib/Target/X86/X86ISelLowering.cpp and calls them from the scalar int-to-FP
// lowering paths, so that
//     cast (extract V, 0)  -->  extract (cast V), 0
// i.e. the conversion is performed on the whole vector and element 0 is
// extracted afterwards. The patch's own doc comment gives the motivation:
// avoiding a round-trip between XMM and GPR.
//
// useVectorCast(Opcode, FromVT, ToVT, Subtarget) -> bool
//   ISD::SINT_TO_FP: false unless Subtarget.hasSSE2() and FromVT == v4i32;
//     then true for ToVT == v4f32, or for ToVT == v4f64 when hasAVX().
//   ISD::UINT_TO_FP: false unless Subtarget.hasAVX512() and FromVT == v4i32;
//     then true for ToVT == v4f32 or ToVT == v4f64.
//   Any other opcode: false.
//
// vectorizeExtractedCast(Cast, DAG, Subtarget) -> SDValue
//   Reads operand 0 of Cast and returns an empty SDValue() unless that operand
//   is an ISD::EXTRACT_VECTOR_ELT for which hasOneUse() is true and whose index
//   operand satisfies isNullConstant(). ToVT is built from Cast's result type
//   and the element count of the extracted-from vector; if useVectorCast()
//   rejects the combination, an empty SDValue() is returned. Otherwise it
//   creates a node with Cast's opcode on the whole vector and returns an
//   EXTRACT_VECTOR_ELT of that node, reusing the original index operand.
//
// Call sites: inside X86TargetLowering::LowerSINT_TO_FP, ahead of its
// SrcVT.isVector() handling, and in a later hunk right after the early return
// to lowerUINT_TO_FP_vec for vector result types (enclosing function is not
// visible in this patch; presumably the UINT_TO_FP lowering -- confirm).
//
// The remaining hunks update FileCheck expectations in known-bits-vector.ll,
// known-signbits-vector.ll and vec_int_to_fp.ll.
Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -17380,6 +17380,55 @@ DAG.getIntPtrConstant(0, dl)); } +static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, + const X86Subtarget &Subtarget) { + switch (Opcode) { + case ISD::SINT_TO_FP: + // TODO: Handle wider types with AVX/AVX512. + if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) + return false; + // CVTDQ2PS or (V)CVTDQ2PD + return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); + + case ISD::UINT_TO_FP: + // TODO: Handle wider types and i64 elements. + if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) + return false; + // VCVTUDQ2PS or VCVTUDQ2PD + return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; + + default: + return false; + } +} + +/// Given a scalar cast operation that is extracted from a vector, try to +/// vectorize the cast op followed by extraction. This will avoid an expensive +/// round-trip between XMM and GPR. +static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: The limitation for extracting from the 0-element is not required, + // but if we extract from some other element, it will require shuffling to + // get the result into the right place. 
+ SDValue Extract = Cast.getOperand(0); + MVT DestVT = Cast.getSimpleValueType(); + if (!Extract.hasOneUse() || Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isNullConstant(Extract.getOperand(1))) + return SDValue(); + + SDValue VecOp = Extract.getOperand(0); + MVT FromVT = VecOp.getSimpleValueType(); + MVT ToVT = MVT::getVectorVT(DestVT, FromVT.getVectorNumElements()); + if (!useVectorCast(Cast.getOpcode(), FromVT, ToVT, Subtarget)) + return SDValue(); + + // cast (extract V, Y) --> extract (cast V), Y + SDLoc DL(Cast); + SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, + Extract.getOperand(1)); +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); @@ -17387,6 +17436,9 @@ MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, @@ -17749,6 +17801,9 @@ if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) + return Extract; + MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); Index: test/CodeGen/X86/known-bits-vector.ll =================================================================== --- test/CodeGen/X86/known-bits-vector.ll +++ test/CodeGen/X86/known-bits-vector.ll @@ -25,8 +25,7 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %eax ; X32-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax Index: test/CodeGen/X86/known-signbits-vector.ll 
=================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -92,8 +92,7 @@ ; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0] ; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -120,8 +119,7 @@ ; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsllq $20, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -152,8 +150,7 @@ ; X32-NEXT: vmovd %eax, %xmm0 ; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X32-NEXT: vpsrlq $3, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -239,8 +236,7 @@ ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -283,8 +279,7 @@ ; X32-NEXT: vpand %xmm1, %xmm0, %xmm2 ; X32-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0 +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ 
-5556,15 +5556,12 @@ define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind { ; SSE-LABEL: extract0_sitofp_v4i32_f32: ; SSE: # %bb.0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssl %eax, %xmm0 +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: extract0_sitofp_v4i32_f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 %r = sitofp i32 %e to float @@ -5581,8 +5578,7 @@ ; ; AVX-LABEL: extract0_sitofp_v4i32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vcvtsi2sdl %eax, %xmm1, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 %r = sitofp i32 %e to double @@ -5603,11 +5599,31 @@ ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0 ; VEX-NEXT: retq ; -; AVX512-LABEL: extract0_uitofp_v4i32_f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vcvtusi2ssl %eax, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: extract0_uitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_uitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 %r = uitofp i32 %e to float ret float %r @@ 
-5627,11 +5643,35 @@ ; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm0 ; VEX-NEXT: retq ; -; AVX512-LABEL: extract0_uitofp_v4i32_f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vcvtusi2sdl %eax, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: extract0_uitofp_v4i32_f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_uitofp_v4i32_f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 %r = uitofp i32 %e to double ret double %r @@ -5643,9 +5683,7 @@ ; SSE2-LABEL: extract3_sitofp_v4i32_f32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ssl %eax, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract3_sitofp_v4i32_f32: