diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19141,6 +19141,45 @@
                      DAG.getIntPtrConstant(0, DL));
 }
 
+/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
+/// try to vectorize the cast ops. This will avoid an expensive round-trip
+/// between XMM and GPR.
+static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+  // TODO: Allow FP_TO_UINT.
+  SDValue CastToInt = CastToFP.getOperand(0);
+  MVT VT = CastToFP.getSimpleValueType();
+  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
+    return SDValue();
+
+  MVT IntVT = CastToInt.getSimpleValueType();
+  SDValue X = CastToInt.getOperand(0);
+  // TODO: Allow size-changing from source to dest (double -> i32 -> float)
+  if (X.getSimpleValueType() != VT ||
+      VT.getSizeInBits() != IntVT.getSizeInBits())
+    return SDValue();
+
+  // See if we have a 128-bit vector cast op for this type of cast.
+  unsigned NumEltsInXMM = 128 / VT.getScalarSizeInBits();
+  MVT Vec128VT = MVT::getVectorVT(VT, NumEltsInXMM);
+  MVT Int128VT = MVT::getVectorVT(IntVT, NumEltsInXMM);
+  if (!useVectorCast(CastToFP.getOpcode(), Int128VT, Vec128VT, Subtarget))
+    return SDValue();
+
+  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
+  //
+  // We are not defining the high elements (for example, zero them) because
+  // that could nullify any performance advantage that we hoped to gain from
+  // this vector op hack. We do not expect any adverse effects (like denorm
+  // penalties) with cast ops.
+  SDLoc DL(CastToFP);
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, X);
+  SDValue VCastToInt = DAG.getNode(ISD::FP_TO_SINT, DL, Int128VT, VecX);
+  SDValue VCastToFP = DAG.getNode(ISD::SINT_TO_FP, DL, Vec128VT, VCastToInt);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
+}
+
 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
   SDLoc DL(Op);
@@ -19243,6 +19282,9 @@
   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;
 
+  if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
+    return R;
+
   if (SrcVT.isVector()) {
     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
       // Note: Since v2f64 is a legal type. We don't need to zero extend the
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -223,15 +223,14 @@
 define float @trunc_signed_f32_no_fast_math(float %x) {
 ; SSE-LABEL: trunc_signed_f32_no_fast_math:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvttss2si %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_signed_f32_no_fast_math:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vcvttss2si %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm0
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX1-NEXT:    retq
   %i = fptosi float %x to i32
   %r = sitofp i32 %i to float
@@ -241,9 +240,8 @@
 define float @trunc_signed_f32(float %x) #0 {
 ; SSE2-LABEL: trunc_signed_f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    cvttss2si %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %eax, %xmm0
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_signed_f32:
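
Note (illustrative, not part of the patch): the snippet below is a minimal C++ source-level reproduction of the pattern this lowering targets; the file and function names are hypothetical. At -O2 it produces the same fptosi/sitofp IR pair exercised by the ftrunc.ll tests above, so with this change the SSE codegen should stay in the XMM domain (cvttps2dq + cvtdq2ps) instead of round-tripping through a GPR (cvttss2si + cvtsi2ss).

// repro.cpp (hypothetical): compile with `clang++ -O2 -S repro.cpp`
// and inspect the emitted assembly.
//
// Casting float -> int -> float truncates toward zero for values that
// fit in i32, which is why this is "almost an ftrunc". At -O2 it
// compiles to the fptosi + sitofp pair that lowerFPToIntToFP vectorizes.
float trunc_signed_f32(float x) {
  return static_cast<float>(static_cast<int>(x));
}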