Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17578,28 +17578,41 @@ /// round-trip between XMM and GPR. static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // TODO: The limitation for extracting from the 0-element is not required, - // but if we extract from some other element, it will require shuffling to - // get the result into the right place. // TODO: This could be enhanced to handle smaller integer types by peeking // through an extend. SDValue Extract = Cast.getOperand(0); MVT DestVT = Cast.getSimpleValueType(); if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isNullConstant(Extract.getOperand(1))) + !isa<ConstantSDNode>(Extract.getOperand(1))) return SDValue(); + // See if we have a 128-bit vector cast op for this type of cast. SDValue VecOp = Extract.getOperand(0); MVT FromVT = VecOp.getSimpleValueType(); - MVT ToVT = MVT::getVectorVT(DestVT, FromVT.getVectorNumElements()); - if (!useVectorCast(Cast.getOpcode(), FromVT, ToVT, Subtarget)) + unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits(); + MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM); + MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM); + if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget)) return SDValue(); - // cast (extract V, Y) --> extract (cast V), Y + // If we are extracting from a non-zero element, first shuffle the source + // vector to allow extracting from element zero. SDLoc DL(Cast); + if (!isNullConstant(Extract.getOperand(1))) { + SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1); + Mask[0] = Extract.getConstantOperandVal(1); + VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask); + } + // If the source vector is wider than 128-bits, extract the low part. Do not + // create an unnecessarily wide vector cast op. 
+ if (FromVT != Vec128VT) + VecOp = extract128BitVector(VecOp, 0, DAG, DL); + + // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0 + // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, - Extract.getOperand(1)); + DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, Index: llvm/test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- llvm/test/CodeGen/X86/known-signbits-vector.ll +++ llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -65,8 +65,8 @@ ; X32-LABEL: signbits_ashr_extract_sitofp_0: ; X32: # %bb.0: ; X32-NEXT: pushl %eax -; X32-NEXT: vextractps $1, %xmm0, %eax -; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 +; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax Index: llvm/test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -5726,23 +5726,16 @@ ; Extract non-zero element from int vector and convert to FP. 
define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind { -; SSE2-LABEL: extract3_sitofp_v4i32_f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: extract3_sitofp_v4i32_f32: -; SSE41: # %bb.0: -; SSE41-NEXT: extractps $3, %xmm0, %eax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ssl %eax, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: extract3_sitofp_v4i32_f32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: extract3_sitofp_v4i32_f32: ; AVX: # %bb.0: -; AVX-NEXT: vextractps $3, %xmm0, %eax -; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 %r = sitofp i32 %e to float @@ -5767,8 +5760,8 @@ ; ; AVX-LABEL: extract3_sitofp_v4i32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vextractps $3, %xmm0, %eax -; AVX-NEXT: vcvtsi2sdl %eax, %xmm1, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 %r = sitofp i32 %e to double @@ -5797,11 +5790,33 @@ ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0 ; VEX-NEXT: retq ; -; AVX512-LABEL: extract3_uitofp_v4i32_f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractps $3, %xmm0, %eax -; AVX512-NEXT: vcvtusi2ssl %eax, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: extract3_uitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: 
extract3_uitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 %r = uitofp i32 %e to float ret float %r @@ -5829,11 +5844,37 @@ ; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm0 ; VEX-NEXT: retq ; -; AVX512-LABEL: extract3_uitofp_v4i32_f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextractps $3, %xmm0, %eax -; AVX512-NEXT: vcvtusi2sdl %eax, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: extract3_uitofp_v4i32_f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 %r = 
uitofp i32 %e to double ret double %r