Diff 150542

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 667 Lines • ▼ Show 20 Lines	static char isNegatibleForFree(SDValue Op, bool LegalOperations,
const TargetLowering &TLI,		const TargetLowering &TLI,
const TargetOptions *Options,		const TargetOptions *Options,
unsigned Depth = 0) {		unsigned Depth = 0) {
// fneg is removable even if it has multiple uses.		// fneg is removable even if it has multiple uses.
if (Op.getOpcode() == ISD::FNEG) return 2;		if (Op.getOpcode() == ISD::FNEG) return 2;

// Don't allow anything with multiple uses unless we know it is free.		// Don't allow anything with multiple uses unless we know it is free.
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
		const SDNodeFlags Flags = Op->getFlags();
if (!Op.hasOneUse())		if (!Op.hasOneUse())
if (!(Op.getOpcode() == ISD::FP_EXTEND &&		if (!(Op.getOpcode() == ISD::FP_EXTEND &&
TLI.isFPExtFree(VT, Op.getOperand(0).getValueType())))		TLI.isFPExtFree(VT, Op.getOperand(0).getValueType())))
return 0;		return 0;

// Don't recurse exponentially.		// Don't recurse exponentially.
if (Depth > 6) return 0;		if (Depth > 6) return 0;

Show All 28 Lines	case ISD::FSUB:
if (!Options->NoSignedZerosFPMath &&		if (!Options->NoSignedZerosFPMath &&
!Op.getNode()->getFlags().hasNoSignedZeros())		!Op.getNode()->getFlags().hasNoSignedZeros())
return 0;		return 0;

// fold (fneg (fsub A, B)) -> (fsub B, A)		// fold (fneg (fsub A, B)) -> (fsub B, A)
return 1;		return 1;

case ISD::FMUL:		case ISD::FMUL:
case ISD::FDIV:
if (Options->HonorSignDependentRoundingFPMath()) return 0;		if (Options->HonorSignDependentRoundingFPMath()) return 0;

// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))		// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,		if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
Options, Depth + 1))		Options, Depth + 1))
return V;		return V;

return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,		return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
Depth + 1);		Depth + 1);

		case ISD::FDIV:
		if (Options->HonorSignDependentRoundingFPMathOption &&
		!Options->UnsafeFPMath && !Flags.hasNoNaNs()) return 0;

		spatelUnsubmitted Not Done Reply Inline Actions See comments in D47911 - we're adding FMF constraints where there were none before? spatel: See comments in D47911 - we're adding FMF constraints where there were none before?
		// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
		if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
		Options, Depth + 1))
		return V;

		return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
		Depth + 1);

case ISD::FP_EXTEND:		case ISD::FP_EXTEND:
case ISD::FP_ROUND:		case ISD::FP_ROUND:
case ISD::FSIN:		case ISD::FSIN:
return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,		return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
Depth + 1);		Depth + 1);
}		}
}		}

Show All 38 Lines	if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
return Op.getOperand(1);		return Op.getOperand(1);

// fold (fneg (fsub A, B)) -> (fsub B, A)		// fold (fneg (fsub A, B)) -> (fsub B, A)
return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),		return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(0), Flags);		Op.getOperand(1), Op.getOperand(0), Flags);

case ISD::FMUL:		case ISD::FMUL:
case ISD::FDIV:		case ISD::FDIV:
assert(!Options.HonorSignDependentRoundingFPMath());		assert(!Options.HonorSignDependentRoundingFPMath() \|\| Flags.hasNoNaNs());

// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)		// fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
if (isNegatibleForFree(Op.getOperand(0), LegalOperations,		if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
DAG.getTargetLoweringInfo(), &Options, Depth+1))		DAG.getTargetLoweringInfo(), &Options, Depth+1))
return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),		return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
GetNegatedExpression(Op.getOperand(0), DAG,		GetNegatedExpression(Op.getOperand(0), DAG,
LegalOperations, Depth+1),		LegalOperations, Depth+1),
Op.getOperand(1), Flags);		Op.getOperand(1), Flags);
▲ Show 20 Lines • Show All 9,990 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::visitFDIV(SDNode *N) {

// fold (fdiv c1, c2) -> c1/c2		// fold (fdiv c1, c2) -> c1/c2
if (N0CFP && N1CFP)		if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);		return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);

if (SDValue NewSel = foldBinOpIntoSelect(N))		if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;		return NewSel;

if (Options.UnsafeFPMath) {		if (Options.UnsafeFPMath \|\| Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.		// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {		if (N1CFP) {
// Compute the reciprocal 1.0 / c2.		// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();		const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0		APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);		APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that		// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).		// isn't too nasty (eg NaN, denormal, ...).
▲ Show 20 Lines • Show All 7,329 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/fdiv.f16.ll

	Show First 20 Lines • Show All 212 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, 2.0			%rcp = fdiv arcp half %x, 2.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:			; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
	; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}			; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
				spatelUnsubmitted Not Done Reply Inline Actions I don't know enough about AMDGPU to understand what this diff means. I'd prefer that we add a test for some target where this kind of test currently produces fdiv asm, and this patch will allow it to become an fmul. spatel: I don't know enough about AMDGPU to understand what this diff means. I'd prefer that we add a…
				mcberg2017AuthorUnsubmitted Not Done Reply Inline Actions Working on it, should have a new test in a bit... mcberg2017: Working on it, should have a new test in a bit...
				arsenmUnsubmitted Not Done Reply Inline Actions Changing the value of the constant here is pretty suspicious arsenm: Changing the value of the constant here is pretty suspicious
				mcberg2017AuthorUnsubmitted Not Done Reply Inline Actions So I walked through the APFloat interface; it does return the correct 16-bit value, 0x2E66, which is then converted to F32 and is 0x3dccc000. The old value bypassed conversion in visitFDIV and we were given the decimal value of 1/10, which in 32-bit is 0x3dcccccd. The new behavior is more correct than what used to happen, since we are being asked to approximate a reciprocal divide on a 16-bit value. mcberg2017: So I walked through the APFloat interface; it does return the correct 16-bit value, 0x2E66…
				mcberg2017AuthorUnsubmitted Not Done Reply Inline Actions Matt and I have discussed this offline, noting that the new behavior is correct with respect to the changed constants in this test. mcberg2017: Matt and I have discussed this offline, noting that the new behavior is correct with respect to the…

	; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}			; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
	; GFX8_9: buffer_store_short [[MUL]]			; GFX8_9: buffer_store_short [[MUL]]
	define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, 10.0			%rcp = fdiv arcp half %x, 10.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:			; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
	; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}			; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}

	; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}			; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
	; GFX8_9: buffer_store_short [[MUL]]			; GFX8_9: buffer_store_short [[MUL]]
	define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, -10.0			%rcp = fdiv arcp half %x, -10.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	Show All 9 Lines

test/CodeGen/X86/fmf-flags.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown \| FileCheck %s -check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-unknown \| FileCheck %s -check-prefix=X64
	; RUN: llc < %s -mtriple=i686-unknown \| FileCheck %s -check-prefix=X86			; RUN: llc < %s -mtriple=i686-unknown \| FileCheck %s -check-prefix=X86

	declare float @llvm.sqrt.f32(float %x);			declare float @llvm.sqrt.f32(float %x);

	define float @fast_recip_sqrt(float %x) {			define float @fast_recip_sqrt(float %x) {
	; X64-LABEL: fast_recip_sqrt:			; X64-LABEL: fast_recip_sqrt:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: rsqrtss %xmm0, %xmm1			; X64-NEXT: rsqrtss %xmm0, %xmm1
	; X64-NEXT: xorps %xmm2, %xmm2
	; X64-NEXT: cmpeqss %xmm0, %xmm2
	; X64-NEXT: mulss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss {{.*}}(%rip), %xmm3
	; X64-NEXT: mulss %xmm0, %xmm3
	; X64-NEXT: mulss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: addss {{.*}}(%rip), %xmm0			; X64-NEXT: addss {{.*}}(%rip), %xmm0
	; X64-NEXT: mulss %xmm3, %xmm0			; X64-NEXT: mulss {{.*}}(%rip), %xmm1
	; X64-NEXT: andnps %xmm0, %xmm2			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss {{.*}}(%rip), %xmm0
	; X64-NEXT: divss %xmm2, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
				spatelUnsubmitted Not Done Reply Inline Actions This is an interesting difference. Before this patch, we wouldn't recognize that we could use an estimate from the fdiv node, but we did recognize that we could use an estimate on the square root itself. But at that point, we don't realize that we're actually calculating a reciprocal square root (as opposed to a plain square root estimate)...so we generate the safety check for a 0.0 input. spatel: This is an interesting difference. Before this patch, we wouldn't recognize that we could use…
	;			;
	; X86-LABEL: fast_recip_sqrt:			; X86-LABEL: fast_recip_sqrt:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: flds {{[0-9]+}}(%esp)			; X86-NEXT: flds {{[0-9]+}}(%esp)
	; X86-NEXT: fsqrt			; X86-NEXT: fsqrt
	; X86-NEXT: fld1			; X86-NEXT: fld1
	; X86-NEXT: fdivp %st(1)			; X86-NEXT: fdivp %st(1)
	; X86-NEXT: retl			; X86-NEXT: retl
	▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines

	; The sqrt is strict.			; The sqrt is strict.

	@sqrt1 = common global float 0.000000e+00, align 4			@sqrt1 = common global float 0.000000e+00, align 4

	define float @not_so_fast_recip_sqrt(float %x) {			define float @not_so_fast_recip_sqrt(float %x) {
	; X64-LABEL: not_so_fast_recip_sqrt:			; X64-LABEL: not_so_fast_recip_sqrt:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: sqrtss %xmm0, %xmm1			; X64-NEXT: rsqrtss %xmm0, %xmm1
	; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X64-NEXT: sqrtss %xmm0, %xmm2
	; X64-NEXT: divss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss %xmm1, {{.*}}(%rip)			; X64-NEXT: mulss %xmm1, %xmm0
				; X64-NEXT: addss {{.*}}(%rip), %xmm0
				; X64-NEXT: mulss {{.*}}(%rip), %xmm1
				; X64-NEXT: mulss %xmm1, %xmm0
				; X64-NEXT: movss %xmm2, sqrt1(%rip)
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: not_so_fast_recip_sqrt:			; X86-LABEL: not_so_fast_recip_sqrt:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: flds {{[0-9]+}}(%esp)			; X86-NEXT: flds {{[0-9]+}}(%esp)
	; X86-NEXT: fsqrt			; X86-NEXT: fsqrt
	; X86-NEXT: fld1			; X86-NEXT: fld1
	; X86-NEXT: fdiv %st(1)			; X86-NEXT: fdiv %st(1)
	Show All 10 Lines

This is an archive of the discontinued LLVM Phabricator instance.

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 150542

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AMDGPU/fdiv.f16.ll

test/CodeGen/X86/fmf-flags.ll

This is an archive of the discontinued LLVM Phabricator instance.

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 150542

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AMDGPU/fdiv.f16.ll

test/CodeGen/X86/fmf-flags.ll

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic