Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2415,6 +2415,15 @@
     return false;
   }
 
+  /// If the target supports the specified vector cast operation, use it to
+  /// avoid the more expensive scalarized version of the cast operation.
+  /// This is used pre-legalization, so the target may return true even if the
+  /// types are not legal (legalization will transform the operation into a
+  /// profitable vector instruction sequence).
+  virtual bool useVectorCast(unsigned Opcode, EVT FromVT, EVT ToVT) const {
+    return false;
+  }
+
   // Return true if it is profitable to use a scalar input to a BUILD_VECTOR
   // even if the vector itself has multiple uses.
   virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11906,6 +11906,29 @@
   return SDValue();
 }
 
+/// Given a scalar cast operation that is extracted from a vector, ask the
+/// target if it is profitable to vectorize the cast op followed by extraction.
+/// This may avoid an expensive round-trip between vector and scalar registers.
+static SDValue vectorizeExtractedCast(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!N0.hasOneUse() || N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  SDValue VecOp = N0.getOperand(0);
+  EVT FromVT = VecOp.getValueType();
+  EVT ToVT = EVT::getVectorVT(*DAG.getContext(), VT,
+                              FromVT.getVectorNumElements());
+  if (!TLI.useVectorCast(N->getOpcode(), FromVT, ToVT))
+    return SDValue();
+
+  // cast (extract V, Y) --> extract (cast V), Y
+  SDLoc DL(N);
+  SDValue VCast = DAG.getNode(N->getOpcode(), DL, ToVT, VecOp);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCast, N0.getOperand(1));
+}
+
 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -11960,6 +11983,10 @@
   if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
     return FTrunc;
 
+  if (!LegalOperations)
+    if (SDValue Extract = vectorizeExtractedCast(N, DAG))
+      return Extract;
+
   return SDValue();
 }
 
@@ -12002,6 +12029,10 @@
   if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
     return FTrunc;
 
+  if (!LegalOperations)
+    if (SDValue Extract = vectorizeExtractedCast(N, DAG))
+      return Extract;
+
   return SDValue();
 }
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1063,6 +1063,9 @@
     /// supported.
     bool shouldScalarizeBinop(SDValue) const override;
 
+    /// Vector casts can avoid transfers between scalar and vector registers.
+    bool useVectorCast(unsigned Opcode, EVT FromVT, EVT ToVT) const override;
+
     bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
                                       unsigned AddrSpace) const override {
       // If we can replace more than 2 scalar stores, there will be a reduction
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4920,6 +4920,28 @@
   return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT);
 }
 
+bool X86TargetLowering::useVectorCast(unsigned Opcode, EVT FromVT,
+                                      EVT ToVT) const {
+  switch (Opcode) {
+  case ISD::SINT_TO_FP:
+    // TODO: Handle wider types with AVX/AVX512.
+    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+      return false;
+    // CVTDQ2PS or (V)CVTDQ2PD (pre-AVX, this will legalize to 128-bit)
+    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+  case ISD::UINT_TO_FP:
+    // TODO: Handle wider types and i64 elements.
+    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
+      return false;
+    // VCVTUDQ2PS or VCVTUDQ2PD
+    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+  default:
+    return false;
+  }
+}
+
 bool X86TargetLowering::isCheapToSpeculateCttz() const {
   // Speculate cttz only if we can directly use TZCNT.
   return Subtarget.hasBMI();
Index: test/CodeGen/X86/known-bits-vector.ll
===================================================================
--- test/CodeGen/X86/known-bits-vector.ll
+++ test/CodeGen/X86/known-bits-vector.ll
@@ -25,8 +25,7 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
Index: test/CodeGen/X86/known-signbits-vector.ll
===================================================================
--- test/CodeGen/X86/known-signbits-vector.ll
+++ test/CodeGen/X86/known-signbits-vector.ll
@@ -65,8 +65,8 @@
 ; X32-LABEL: signbits_ashr_extract_sitofp_0:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vextractps $1, %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -88,12 +88,13 @@
 ; X32-LABEL: signbits_ashr_extract_sitofp_1:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
+; X32-NEXT:    vpsrlq $63, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -115,13 +116,16 @@
 ; X32-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; X32-NEXT:    vpsllq $16, %xmm0, %xmm1
 ; X32-NEXT:    vpsllq $20, %xmm0, %xmm0
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -151,9 +155,10 @@
 ; X32-NEXT:    sarl $30, %ecx
 ; X32-NEXT:    vmovd %eax, %xmm0
 ; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X32-NEXT:    vpsrlq $3, %xmm0, %xmm0
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -233,14 +238,15 @@
 ; X32-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -283,8 +289,7 @@
 ; X32-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; X32-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; X32-NEXT:    vpxor %xmm0, %xmm1, %xmm0
-; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
Index: test/CodeGen/X86/vec_int_to_fp.ll
===================================================================
--- test/CodeGen/X86/vec_int_to_fp.ll
+++ test/CodeGen/X86/vec_int_to_fp.ll
@@ -5556,15 +5556,12 @@
 define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
 ; SSE-LABEL: extract0_sitofp_v4i32_f32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2ssl %eax, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_sitofp_v4i32_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %r = sitofp i32 %e to float
@@ -5574,15 +5571,12 @@
 define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
 ; SSE-LABEL: extract0_sitofp_v4i32_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2sdl %eax, %xmm0
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_sitofp_v4i32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sdl %eax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %r = sitofp i32 %e to double
@@ -5603,11 +5597,31 @@
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm0
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2ssl %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %r = uitofp i32 %e to float
   ret float %r
@@ -5627,11 +5641,35 @@
 ; VEX-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm0
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2sdl %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %r = uitofp i32 %e to double
   ret double %r
@@ -5640,25 +5678,16 @@
 
 ; Extract non-zero element from int vector and convert to FP.
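; (Illustration, not from the patch: with a non-zero extract index, the
; vectorized cast converts every lane first and then shuffles the requested
; lane into position, so the updated checks below pair the conversion with a
; shuffle, e.g. cvtdq2ps followed by shufps/vpermilps with mask [3,1,2,3].)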
 
 define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_sitofp_v4i32_f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ssl %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_sitofp_v4i32_f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ssl %eax, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract3_sitofp_v4i32_f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_sitofp_v4i32_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = sitofp i32 %e to float
@@ -5666,25 +5695,19 @@
 }
 
 define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_sitofp_v4i32_f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2sdl %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_sitofp_v4i32_f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sdl %eax, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract3_sitofp_v4i32_f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_sitofp_v4i32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sdl %eax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = sitofp i32 %e to double
@@ -5713,11 +5736,33 @@
 ; VEX-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm0
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextractps $3, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2ssl %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VLDQ-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = uitofp i32 %e to float
   ret float %r
@@ -5745,11 +5790,39 @@
 ; VEX-NEXT:    vcvtsi2sdq %rax, %xmm1, %xmm0
 ; VEX-NEXT:    retq
 ;
-; AVX512-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextractps $3, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2sdl %eax, %xmm1, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %r = uitofp i32 %e to double
   ret double %r
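
The DAGCombiner change above rewrites cast (extract V, Y) --> extract (cast V), Y
whenever the target hook approves the vector types. The same rewrite expressed in
LLVM IR, as a sketch with hypothetical value names (not taken from the patch):

  ; before: extract one lane, then convert it as a scalar
  %e = extractelement <4 x i32> %v, i32 %y
  %r = sitofp i32 %e to float

  ; after: convert the whole vector, then extract the converted lane
  %c = sitofp <4 x i32> %v to <4 x float>
  %r = extractelement <4 x float> %c, i32 %y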
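On x86 the payoff is removing the round-trip between the XMM and GPR register
files. Using extract0_sitofp_v4i32_f32 from the tests above as the reference
(IR and assembly copied from that test; element 0 is extracted, so no shuffle
is needed):

define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
  %e = extractelement <4 x i32> %x, i32 0
  %r = sitofp i32 %e to float
  ret float %r
}
; SSE before the patch:          ; SSE after the patch:
;   movd      %xmm0, %eax        ;   cvtdq2ps %xmm0, %xmm0
;   xorps     %xmm0, %xmm0       ;   retq
;   cvtsi2ssl %eax, %xmm0
;   retq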