Diff 151429

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 734 Lines • ▼ Show 20 Lines	case ISD::FDIV:
if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,		if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
Options, Depth + 1))		Options, Depth + 1))
return V;		return V;

return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,		return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
Depth + 1);		Depth + 1);

case ISD::FP_EXTEND:		case ISD::FP_EXTEND:
case ISD::FP_ROUND:		case ISD::FP_ROUND:
case ISD::FSIN:		case ISD::FSIN:
return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,		return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
		spatelUnsubmitted Not Done Reply Inline Actions See comments in D47911 - we're adding FMF constraints where there were none before? spatel: See comments in D47911 - we're adding FMF constraints where there were none before?
Depth + 1);		Depth + 1);
}		}
}		}

/// If isNegatibleForFree returns true, return the newly negated expression.		/// If isNegatibleForFree returns true, return the newly negated expression.
static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,		static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations, unsigned Depth = 0) {		bool LegalOperations, unsigned Depth = 0) {
const TargetOptions &Options = DAG.getTarget().Options;		const TargetOptions &Options = DAG.getTarget().Options;
▲ Show 20 Lines • Show All 10,110 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::visitFDIV(SDNode *N) {

// fold (fdiv c1, c2) -> c1/c2		// fold (fdiv c1, c2) -> c1/c2
if (N0CFP && N1CFP)		if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);		return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);

if (SDValue NewSel = foldBinOpIntoSelect(N))		if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;		return NewSel;

if (Options.UnsafeFPMath) {		if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.		// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {		if (N1CFP) {
// Compute the reciprocal 1.0 / c2.		// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();		const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0		APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);		APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that		// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).		// isn't too nasty (eg NaN, denormal, ...).
▲ Show 20 Lines • Show All 7,338 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/fdiv.f16.ll

	Show First 20 Lines • Show All 212 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, 2.0			%rcp = fdiv arcp half %x, 2.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:			; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
	; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}			; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
				spatelUnsubmitted Not Done Reply Inline Actions I don't know enough about AMDGPU to understand what this diff means. I'd prefer that we add a test for some target where this kind of test currently produces fdiv asm, and this patch will allow it to become an fmul. spatel: I don't know enough about AMDGPU to understand what this diff means. I'd prefer that we add a…
				mcberg2017AuthorUnsubmitted Not Done Reply Inline Actions Working on it, should have a new test in a bit... mcberg2017: Working on it, should have a new test in a bit...
				arsenmUnsubmitted Not Done Reply Inline Actions Changing the value of the constant here is pretty suspicious arsenm: Changing the value of the constant here is pretty suspicious
				mcberg2017AuthorUnsubmitted Not Done Reply Inline Actions So I walked through the APFloat interface; it does return the correct 16-bit value, 0x2E66, which is then converted to F32 and becomes 0x3dccc000. The old value bypassed the conversion in visitFDIV, so we were given the decimal value of 1/10, which in 32-bit is 0x3dcccccd. The new behavior is more correct than what used to happen, since we are being asked to approximate a reciprocal divide of a 16-bit value. mcberg2017: So I walked through the APFloat interface, it does return the correct 16bit value, 0x2E66…
				mcberg2017AuthorUnsubmitted Not Done Reply Inline Actions Matt and I have discussed this offline, noting that the new behavior is correct with respect to the changed constants in this test. mcberg2017: Matt and I have discussed this offline, noting that the new behavior is correct with respect to the…

	; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}			; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
	; GFX8_9: buffer_store_short [[MUL]]			; GFX8_9: buffer_store_short [[MUL]]
	define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, 10.0			%rcp = fdiv arcp half %x, 10.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:			; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
	; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}			; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}

	; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}			; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
	; GFX8_9: buffer_store_short [[MUL]]			; GFX8_9: buffer_store_short [[MUL]]
	define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, -10.0			%rcp = fdiv arcp half %x, -10.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	Show All 9 Lines

test/CodeGen/X86/fmf-flags.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
	; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86			; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86

	declare float @llvm.sqrt.f32(float %x);			declare float @llvm.sqrt.f32(float %x);

	define float @fast_recip_sqrt(float %x) {			define float @fast_recip_sqrt(float %x) {
	; X64-LABEL: fast_recip_sqrt:			; X64-LABEL: fast_recip_sqrt:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: rsqrtss %xmm0, %xmm1			; X64-NEXT: rsqrtss %xmm0, %xmm1
	; X64-NEXT: xorps %xmm2, %xmm2
	; X64-NEXT: cmpeqss %xmm0, %xmm2
	; X64-NEXT: mulss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss {{.*}}(%rip), %xmm3
	; X64-NEXT: mulss %xmm0, %xmm3
	; X64-NEXT: mulss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: addss {{.*}}(%rip), %xmm0			; X64-NEXT: addss {{.*}}(%rip), %xmm0
	; X64-NEXT: mulss %xmm3, %xmm0			; X64-NEXT: mulss {{.*}}(%rip), %xmm1
	; X64-NEXT: andnps %xmm0, %xmm2			; X64-NEXT: mulss %xmm1, %xmm0
				spatelUnsubmitted Not Done Reply Inline Actions This is an interesting difference. Before this patch, we wouldn't recognize that we could use an estimate from the fdiv node, but we did recognize that we could use an estimate on the square root itself. But at that point, we don't realize that we're actually calculating a reciprocal square root (as opposed to a plain square root estimate)...so we generate the safety check for a 0.0 input. spatel: This is an interesting difference. Before this patch, we wouldn't recognize that we could use…
	; X64-NEXT: movss {{.*}}(%rip), %xmm0
	; X64-NEXT: divss %xmm2, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: fast_recip_sqrt:			; X86-LABEL: fast_recip_sqrt:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: flds {{[0-9]+}}(%esp)			; X86-NEXT: flds {{[0-9]+}}(%esp)
	; X86-NEXT: fsqrt			; X86-NEXT: fsqrt
	; X86-NEXT: fld1			; X86-NEXT: fld1
	; X86-NEXT: fdivp %st(1)			; X86-NEXT: fdivp %st(1)
	▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines

	; The sqrt is strict.			; The sqrt is strict.

	@sqrt1 = common global float 0.000000e+00, align 4			@sqrt1 = common global float 0.000000e+00, align 4

	define float @not_so_fast_recip_sqrt(float %x) {			define float @not_so_fast_recip_sqrt(float %x) {
	; X64-LABEL: not_so_fast_recip_sqrt:			; X64-LABEL: not_so_fast_recip_sqrt:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: sqrtss %xmm0, %xmm1			; X64-NEXT: rsqrtss %xmm0, %xmm1
	; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X64-NEXT: sqrtss %xmm0, %xmm2
	; X64-NEXT: divss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss %xmm1, {{.*}}(%rip)			; X64-NEXT: mulss %xmm1, %xmm0
				; X64-NEXT: addss {{.*}}(%rip), %xmm0
				; X64-NEXT: mulss {{.*}}(%rip), %xmm1
				; X64-NEXT: mulss %xmm1, %xmm0
				; X64-NEXT: movss %xmm2, sqrt1(%rip)
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: not_so_fast_recip_sqrt:			; X86-LABEL: not_so_fast_recip_sqrt:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: flds {{[0-9]+}}(%esp)			; X86-NEXT: flds {{[0-9]+}}(%esp)
	; X86-NEXT: fsqrt			; X86-NEXT: fsqrt
	; X86-NEXT: fld1			; X86-NEXT: fld1
	; X86-NEXT: fdiv %st(1)			; X86-NEXT: fdiv %st(1)
	; X86-NEXT: fxch %st(1)			; X86-NEXT: fxch %st(1)
	; X86-NEXT: fstps sqrt1			; X86-NEXT: fstps sqrt1
	; X86-NEXT: retl			; X86-NEXT: retl
	%y = call float @llvm.sqrt.f32(float %x)			%y = call float @llvm.sqrt.f32(float %x)
	%z = fdiv fast float 1.0, %y			%z = fdiv fast float 1.0, %y
	store float %y, float* @sqrt1, align 4			store float %y, float* @sqrt1, align 4
	%ret = fadd float %z , 14.5			%ret = fadd float %z , 14.5
	ret float %z			ret float %z
	}			}

				define float @div_arcp_by_const(half %x) {
				; X64-LABEL: .LCPI4_0:
				; X64-NEXT: .long 1036828672
				; X64-LABEL: div_arcp_by_const:
				; X64: movzwl %ax, %edi
				; X64: mulss .LCPI4_0(%rip), %xmm0
				;
				; X86-LABEL: .LCPI4_0:
				; X86-NEXT: .long 1036828672
				; X86-LABEL: div_arcp_by_const:
				; X86: movzwl %ax, %eax
				; X86: fmuls .LCPI4_0
				%rcp = fdiv arcp half %x, 10.0
				%z = fpext half %rcp to float
				ret float %z
				}

This is an archive of the discontinued LLVM Phabricator instance.

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 151429

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AMDGPU/fdiv.f16.ll

test/CodeGen/X86/fmf-flags.ll

This is an archive of the discontinued LLVM Phabricator instance.

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 151429

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AMDGPU/fdiv.f16.ll

test/CodeGen/X86/fmf-flags.ll

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic