Diff 151556

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,859 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::visitFDIV(SDNode *N) {

// fold (fdiv c1, c2) -> c1/c2		// fold (fdiv c1, c2) -> c1/c2
if (N0CFP && N1CFP)		if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);		return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);

if (SDValue NewSel = foldBinOpIntoSelect(N))		if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;		return NewSel;

if (Options.UnsafeFPMath) {		if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.		// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {		if (N1CFP) {
// Compute the reciprocal 1.0 / c2.		// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();		const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0		APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);		APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that		// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).		// isn't too nasty (eg NaN, denormal, ...).
▲ Show 20 Lines • Show All 7,341 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll

	Show First 20 Lines • Show All 212 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, 2.0			%rcp = fdiv arcp half %x, 2.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:			; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
	; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}			; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}

	; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}			; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
	; GFX8_9: buffer_store_short [[MUL]]			; GFX8_9: buffer_store_short [[MUL]]
	define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, 10.0			%rcp = fdiv arcp half %x, 10.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:			; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
	; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}			; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}

	; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}			; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
	; GFX8_9: buffer_store_short [[MUL]]			; GFX8_9: buffer_store_short [[MUL]]
	define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {			define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
	%x = load half, half addrspace(1)* undef			%x = load half, half addrspace(1)* undef
	%rcp = fdiv arcp half %x, -10.0			%rcp = fdiv arcp half %x, -10.0
	store half %rcp, half addrspace(1)* %out, align 4			store half %rcp, half addrspace(1)* %out, align 4
	ret void			ret void
	Show All 9 Lines

llvm/trunk/test/CodeGen/X86/fmf-flags.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
	; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86			; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86

	declare float @llvm.sqrt.f32(float %x);			declare float @llvm.sqrt.f32(float %x);

	define float @fast_recip_sqrt(float %x) {			define float @fast_recip_sqrt(float %x) {
	; X64-LABEL: fast_recip_sqrt:			; X64-LABEL: fast_recip_sqrt:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: rsqrtss %xmm0, %xmm1			; X64-NEXT: rsqrtss %xmm0, %xmm1
	; X64-NEXT: xorps %xmm2, %xmm2
	; X64-NEXT: cmpeqss %xmm0, %xmm2
	; X64-NEXT: mulss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss {{.*}}(%rip), %xmm3
	; X64-NEXT: mulss %xmm0, %xmm3
	; X64-NEXT: mulss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: addss {{.*}}(%rip), %xmm0			; X64-NEXT: addss {{.*}}(%rip), %xmm0
	; X64-NEXT: mulss %xmm3, %xmm0			; X64-NEXT: mulss {{.*}}(%rip), %xmm1
	; X64-NEXT: andnps %xmm0, %xmm2			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss {{.*}}(%rip), %xmm0
	; X64-NEXT: divss %xmm2, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: fast_recip_sqrt:			; X86-LABEL: fast_recip_sqrt:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: flds {{[0-9]+}}(%esp)			; X86-NEXT: flds {{[0-9]+}}(%esp)
	; X86-NEXT: fsqrt			; X86-NEXT: fsqrt
	; X86-NEXT: fld1			; X86-NEXT: fld1
	; X86-NEXT: fdivp %st(1)			; X86-NEXT: fdivp %st(1)
	▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines

	; The sqrt is strict.			; The sqrt is strict.

	@sqrt1 = common global float 0.000000e+00, align 4			@sqrt1 = common global float 0.000000e+00, align 4

	define float @not_so_fast_recip_sqrt(float %x) {			define float @not_so_fast_recip_sqrt(float %x) {
	; X64-LABEL: not_so_fast_recip_sqrt:			; X64-LABEL: not_so_fast_recip_sqrt:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: sqrtss %xmm0, %xmm1			; X64-NEXT: rsqrtss %xmm0, %xmm1
	; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X64-NEXT: sqrtss %xmm0, %xmm2
	; X64-NEXT: divss %xmm1, %xmm0			; X64-NEXT: mulss %xmm1, %xmm0
	; X64-NEXT: movss %xmm1, {{.*}}(%rip)			; X64-NEXT: mulss %xmm1, %xmm0
				; X64-NEXT: addss {{.*}}(%rip), %xmm0
				; X64-NEXT: mulss {{.*}}(%rip), %xmm1
				; X64-NEXT: mulss %xmm1, %xmm0
				; X64-NEXT: movss %xmm2, sqrt1(%rip)
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: not_so_fast_recip_sqrt:			; X86-LABEL: not_so_fast_recip_sqrt:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: flds {{[0-9]+}}(%esp)			; X86-NEXT: flds {{[0-9]+}}(%esp)
	; X86-NEXT: fsqrt			; X86-NEXT: fsqrt
	; X86-NEXT: fld1			; X86-NEXT: fld1
	; X86-NEXT: fdiv %st(1)			; X86-NEXT: fdiv %st(1)
	; X86-NEXT: fxch %st(1)			; X86-NEXT: fxch %st(1)
	; X86-NEXT: fstps sqrt1			; X86-NEXT: fstps sqrt1
	; X86-NEXT: retl			; X86-NEXT: retl
	%y = call float @llvm.sqrt.f32(float %x)			%y = call float @llvm.sqrt.f32(float %x)
	%z = fdiv fast float 1.0, %y			%z = fdiv fast float 1.0, %y
	store float %y, float* @sqrt1, align 4			store float %y, float* @sqrt1, align 4
	%ret = fadd float %z , 14.5			%ret = fadd float %z , 14.5
	ret float %z			ret float %z
	}			}

				define float @div_arcp_by_const(half %x) {
				; X64-LABEL: .LCPI4_0:
				; X64-NEXT: .long 1036828672
				; X64-LABEL: div_arcp_by_const:
				; X64: movzwl %ax, %edi
				; X64: mulss .LCPI4_0(%rip), %xmm0
				;
				; X86-LABEL: .LCPI4_0:
				; X86-NEXT: .long 1036828672
				; X86-LABEL: div_arcp_by_const:
				; X86: movzwl %ax, %eax
				; X86: fmuls .LCPI4_0
				%rcp = fdiv arcp half %x, 10.0
				%z = fpext half %rcp to float
				ret float %z
				}

This is an archive of the discontinued LLVM Phabricator instance.

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 151556

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll

llvm/trunk/test/CodeGen/X86/fmf-flags.ll

This is an archive of the discontinued LLVM Phabricator instance.

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 151556

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll

llvm/trunk/test/CodeGen/X86/fmf-flags.ll

Utilize new SDNode flag functionality to expand current support for fdiv
ClosedPublic