This is an archive of the discontinued LLVM Phabricator instance.

[x86] use vector instructions to lower even more FP->int->FP casts
ClosedPublic

Authored by spatel on Apr 23 2020, 2:03 PM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper
pcordes

Commits

rG7f4ff782d406: [x86] use vector instructions to lower even more FP->int->FP casts

Summary

This is another enhancement to D77895/D78362 to avoid a round-trip from XMM->GPR->XMM.
This time we handle the case where the starting and ending FP types differ (f32 -> i32 -> f64 or f64 -> i32 -> f32), but always with signed i32 as the intermediate value.
I think this covers all of the faux vector optimization possibilities for pre-AVX512.

There is at least one other transform mentioned in PR36617:
https://bugs.llvm.org/show_bug.cgi?id=36617#c19
...where we fold an 'fpext' into a preceding 'sitofp'. I think we will want to handle that earlier (DAGCombiner or instcombine) because that's a target-independent optimization.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

spatel created this revision. · Apr 23 2020, 2:03 PM

Herald added subscribers: hiraditya, mcrosier. · View Herald Transcript · Apr 23 2020, 2:03 PM

LGTM

This revision is now accepted and ready to land. · Apr 23 2020, 2:42 PM

LGTM - cheers!

Closed by commit rG7f4ff782d406: [x86] use vector instructions to lower even more FP->int->FP casts (authored by spatel). · Explain Why · Apr 25 2020, 9:00 AM

This revision was automatically updated to reflect the committed changes.

Herald added a project: Restricted Project. · View Herald Transcript · Apr 25 2020, 9:00 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

25 lines

test/

CodeGen/

X86/

ftrunc.ll

36 lines

Diff 260110

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,175 Lines • ▼ Show 20 Lines	static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
// TODO: Allow FP_TO_UINT.		// TODO: Allow FP_TO_UINT.
SDValue CastToInt = CastToFP.getOperand(0);		SDValue CastToInt = CastToFP.getOperand(0);
MVT VT = CastToFP.getSimpleValueType();		MVT VT = CastToFP.getSimpleValueType();
if (CastToInt.getOpcode() != ISD::FP_TO_SINT \|\| VT.isVector())		if (CastToInt.getOpcode() != ISD::FP_TO_SINT \|\| VT.isVector())
return SDValue();		return SDValue();

MVT IntVT = CastToInt.getSimpleValueType();		MVT IntVT = CastToInt.getSimpleValueType();
SDValue X = CastToInt.getOperand(0);		SDValue X = CastToInt.getOperand(0);
// TODO: Allow size-changing from source to dest (double -> i32 -> float)		MVT SrcVT = X.getSimpleValueType();
if (X.getSimpleValueType() != VT)		if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
return SDValue();		return SDValue();

// See if we have 128-bit vector cast instructions for this type of cast.		// See if we have 128-bit vector cast instructions for this type of cast.
// We need cvttps2dq + cvtdq2ps or cvttpd2dq + cvtdq2pd.		// We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
if (!Subtarget.hasSSE2() \|\| (VT != MVT::f32 && VT != MVT::f64) \|\|		if (!Subtarget.hasSSE2() \|\| (VT != MVT::f32 && VT != MVT::f64) \|\|
IntVT != MVT::i32)		IntVT != MVT::i32)
return SDValue();		return SDValue();

unsigned NumFPEltsInXMM = 128 / VT.getScalarSizeInBits();		unsigned SrcSize = SrcVT.getSizeInBits();
unsigned NumIntEltsInXMM = 128 / IntVT.getScalarSizeInBits();		unsigned IntSize = IntVT.getSizeInBits();
MVT VecFPVT = MVT::getVectorVT(VT, NumFPEltsInXMM);		unsigned VTSize = VT.getSizeInBits();
MVT VecIntVT = MVT::getVectorVT(IntVT, NumIntEltsInXMM);		MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
		MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
		MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);

// We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.		// We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
bool NeedX86Opcodes = VT.getSizeInBits() != IntVT.getSizeInBits();
unsigned ToIntOpcode =		unsigned ToIntOpcode =
NeedX86Opcodes ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;		SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
unsigned ToFPOpcode =		unsigned ToFPOpcode =
NeedX86Opcodes ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;		IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

// sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0		// sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
//		//
// We are not defining the high elements (for example, zero them) because		// We are not defining the high elements (for example, zero them) because
// that could nullify any performance advantage that we hoped to gain from		// that could nullify any performance advantage that we hoped to gain from
// this vector op hack. We do not expect any adverse effects (like denorm		// this vector op hack. We do not expect any adverse effects (like denorm
// penalties) with cast ops.		// penalties) with cast ops.
SDLoc DL(CastToFP);		SDLoc DL(CastToFP);
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);		SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecFPVT, X);		SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);		SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecFPVT, VCastToInt);		SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}		}

static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,		static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
SDLoc DL(Op);		SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();		bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op->getSimpleValueType(0);		MVT VT = Op->getSimpleValueType(0);
▲ Show 20 Lines • Show All 29,676 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/ftrunc.ll

Show First 20 Lines • Show All 295 Lines • ▼ Show 20 Lines	; AVX1-NEXT: retq
%i = fptosi double %x to i32		%i = fptosi double %x to i32
%r = sitofp i32 %i to double		%r = sitofp i32 %i to double
ret double %r		ret double %r
}		}

define double @trunc_f32_signed32_f64_no_fast_math(float %x) {		define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math:		; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax		; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: xorps %xmm0, %xmm0		; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: cvtsi2sd %eax, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math:		; AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vcvttss2si %xmm0, %eax		; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sd %eax, %xmm1, %xmm0		; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
%i = fptosi float %x to i32		%i = fptosi float %x to i32
%r = sitofp i32 %i to double		%r = sitofp i32 %i to double
ret double %r		ret double %r
}		}

define double @trunc_f32_signed32_f64_nsz(float %x) #0 {		define double @trunc_f32_signed32_f64_nsz(float %x) #0 {
; SSE-LABEL: trunc_f32_signed32_f64_nsz:		; SSE-LABEL: trunc_f32_signed32_f64_nsz:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax		; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: xorps %xmm0, %xmm0		; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: cvtsi2sd %eax, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX1-LABEL: trunc_f32_signed32_f64_nsz:		; AVX1-LABEL: trunc_f32_signed32_f64_nsz:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vcvttss2si %xmm0, %eax		; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2sd %eax, %xmm1, %xmm0		; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
%i = fptosi float %x to i32		%i = fptosi float %x to i32
%r = sitofp i32 %i to double		%r = sitofp i32 %i to double
ret double %r		ret double %r
}		}

define float @trunc_f64_signed32_f32_no_fast_math(double %x) {		define float @trunc_f64_signed32_f32_no_fast_math(double %x) {
; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math:		; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax		; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: xorps %xmm0, %xmm0		; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ss %eax, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math:		; AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vcvttsd2si %xmm0, %eax		; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0		; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
%i = fptosi double %x to i32		%i = fptosi double %x to i32
%r = sitofp i32 %i to float		%r = sitofp i32 %i to float
ret float %r		ret float %r
}		}

define float @trunc_f64_signed32_f32_nsz(double %x) #0 {		define float @trunc_f64_signed32_f32_nsz(double %x) #0 {
; SSE-LABEL: trunc_f64_signed32_f32_nsz:		; SSE-LABEL: trunc_f64_signed32_f32_nsz:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax		; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: xorps %xmm0, %xmm0		; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ss %eax, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX1-LABEL: trunc_f64_signed32_f32_nsz:		; AVX1-LABEL: trunc_f64_signed32_f32_nsz:
; AVX1: # %bb.0:		; AVX1: # %bb.0:
; AVX1-NEXT: vcvttsd2si %xmm0, %eax		; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0		; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
%i = fptosi double %x to i32		%i = fptosi double %x to i32
%r = sitofp i32 %i to float		%r = sitofp i32 %i to float
ret float %r		ret float %r
}		}

define double @trunc_signed_f64_no_fast_math(double %x) {		define double @trunc_signed_f64_no_fast_math(double %x) {
; SSE-LABEL: trunc_signed_f64_no_fast_math:		; SSE-LABEL: trunc_signed_f64_no_fast_math:
▲ Show 20 Lines • Show All 163 Lines • Show Last 20 Lines