This is an archive of the discontinued LLVM Phabricator instance.

intrinsic management for fast math sub flags
AbandonedPublic

Authored by mcberg2017 on May 7 2018, 5:02 PM.

Download Raw Diff

Details

Reviewers

spatel
wristow

Summary

This patch adds handling of fast-math flags (FMF) for intrinsics, and is flexible enough to support new intrinsics later with little or no further change.

Diff Detail

Event Timeline

mcberg2017 created this revision. May 7 2018, 5:02 PM

Herald added a subscriber: nemanjai. · View Herald Transcript · May 7 2018, 5:02 PM

mcberg2017 added reviewers: spatel, wristow. May 7 2018, 5:03 PM

mcberg2017 added inline comments.

lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
5542	Minor cleanup of the braces; it will be in the next upload.

Minor update with some cleanup

spatel mentioned this in rL331992: [x86] fix fmaxnum/fminnum with nnan. May 10 2018, 8:44 AM

spatel mentioned this in rL332155: [DAG] add convenience function to propagate FMF; NFC. May 11 2018, 4:17 PM

Updated for recent changes in the scope of this review...

updated to use the convenience function in r332155

Any other comments on these changes? It would be great to proceed.

In D46563#1098394, @mcberg2017 wrote:

Any other comments on these changes? It would be great to proceed.

I'm still not sold on this approach. IIUC, we're still missing propagation on fcmp, vector reduce, and libcalls.

As I mentioned in D46483, D37686 had a more complete implementation. I drafted a much simpler version of that patch locally. Let me add some tests and clean that up a bit and see if that looks better to you.

spatel mentioned this in D46854: [DAG] propagate FMF for all FPMathOperators. May 14 2018, 3:48 PM

mcberg2017 abandoned this revision. May 14 2018, 4:58 PM

spatel mentioned this in rL332358: [DAG] propagate FMF for all FPMathOperators. May 15 2018, 7:20 AM

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

SelectionDAGBuilder.cpp

14 lines

test/

CodeGen/

PowerPC/

fmf-propagation.ll

12 lines

X86/

fmaxnum.ll

24 lines

fminnum.ll

24 lines

pr34149.ll

8 lines

Diff 146450

lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,297 Lines • ▼ Show 20 Lines	void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,

if (!I.getType()->isVoidTy()) {		if (!I.getType()->isVoidTy()) {
if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {		if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);		EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);		Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
} else		} else
Result = lowerRangeToAssertZExt(DAG, I, Result);		Result = lowerRangeToAssertZExt(DAG, I, Result);

		if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
		SDNodeFlags Flags;
		Flags.copyFMF(*FPOp);
		Result->setFlags(Flags);
		}
setValue(&I, Result);		setValue(&I, Result);
}		}
}		}

/// GetSignificand - Get the significand and build it into a floating-point		/// GetSignificand - Get the significand and build it into a floating-point
/// number with exponent of 1:		/// number with exponent of 1:
///		///
/// Op = (Op & 0x007fffff) \| 0x3f800000;		/// Op = (Op & 0x007fffff) \| 0x3f800000;
▲ Show 20 Lines • Show All 1,215 Lines • ▼ Show 20 Lines	case Intrinsic::maxnum: {
return nullptr;		return nullptr;
}		}
case Intrinsic::copysign:		case Intrinsic::copysign:
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,		setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
getValue(I.getArgOperand(0)).getValueType(),		getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),		getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));		getValue(I.getArgOperand(1))));
return nullptr;		return nullptr;
case Intrinsic::fma:		case Intrinsic::fma:
		[Inline comment by mcberg2017 (Author; Unsubmitted; Not Done)]: Minor clean up on the braces, it will be in the next upload.
setValue(&I, DAG.getNode(ISD::FMA, sdl,		setValue(&I, DAG.getNode(ISD::FMA, sdl,
getValue(I.getArgOperand(0)).getValueType(),		getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),		getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),		getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));		getValue(I.getArgOperand(2))));
return nullptr;		return nullptr;
case Intrinsic::experimental_constrained_fadd:		case Intrinsic::experimental_constrained_fadd:
case Intrinsic::experimental_constrained_fsub:		case Intrinsic::experimental_constrained_fsub:
▲ Show 20 Lines • Show All 1,215 Lines • ▼ Show 20 Lines	if (F->isDeclaration()) {
// Is this an LLVM intrinsic or a target-specific intrinsic?		// Is this an LLVM intrinsic or a target-specific intrinsic?
unsigned IID = F->getIntrinsicID();		unsigned IID = F->getIntrinsicID();
if (!IID)		if (!IID)
if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo())		if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo())
IID = II->getIntrinsicID(F);		IID = II->getIntrinsicID(F);

if (IID) {		if (IID) {
RenameFn = visitIntrinsicCall(I, IID);		RenameFn = visitIntrinsicCall(I, IID);
if (!RenameFn)		if (!RenameFn) {
		if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
		SDNodeFlags Flags;
		Flags.copyFMF(*FPOp);
		SDValue Res = getValue(&I);
		Res->setFlags(Flags);
		}
return;		return;
}		}
}		}
		}

// Check for well-known libc/libm calls. If the function is internal, it		// Check for well-known libc/libm calls. If the function is internal, it
// can't be a library call. Don't do the check if marked as nobuiltin for		// can't be a library call. Don't do the check if marked as nobuiltin for
// some reason or the call site requires strict floating point semantics.		// some reason or the call site requires strict floating point semantics.
LibFunc Func;		LibFunc Func;
if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() &&		if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() &&
F->hasName() && LibInfo->getLibFunc(*F, Func) &&		F->hasName() && LibInfo->getLibFunc(*F, Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {		LibInfo->hasOptimizedCodeGen(Func)) {
▲ Show 20 Lines • Show All 3,378 Lines • Show Last 20 Lines

test/CodeGen/PowerPC/fmf-propagation.ll

Show First 20 Lines • Show All 150 Lines • ▼ Show 20 Lines	; GLOBAL-NEXT: blr
%add = fadd fast float %mul, %z		%add = fadd fast float %mul, %z
ret float %add		ret float %add
}		}

; fma(X, 7.0, X * 42.0) --> X * 49.0		; fma(X, 7.0, X * 42.0) --> X * 49.0
; This is the minimum FMF needed for this transform - the FMA allows reassociation.		; This is the minimum FMF needed for this transform - the FMA allows reassociation.

; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'		; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
; FMFDEBUG: fma {{t[0-9]+}}		; FMFDEBUG: fma reassoc {{t[0-9]+}}
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'		; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'

; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'		; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc1:'
; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}		; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'		; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc1:'

define float @fmul_fma_reassoc1(float %x) {		define float @fmul_fma_reassoc1(float %x) {
; FMF-LABEL: fmul_fma_reassoc1:		; FMF-LABEL: fmul_fma_reassoc1:
Show All 19 Lines	; GLOBAL-NEXT: blr
%mul = fmul float %x, 42.0		%mul = fmul float %x, 42.0
%fma = call reassoc float @llvm.fma.f32(float %x, float 7.0, float %mul)		%fma = call reassoc float @llvm.fma.f32(float %x, float 7.0, float %mul)
ret float %fma		ret float %fma
}		}

; This shouldn't change anything - the intermediate fmul result is now also flagged.		; This shouldn't change anything - the intermediate fmul result is now also flagged.

; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'		; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
; FMFDEBUG: fma {{t[0-9]+}}		; FMFDEBUG: fma reassoc {{t[0-9]+}}
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'		; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'

; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'		; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_reassoc2:'
; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}		; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'		; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_reassoc2:'

define float @fmul_fma_reassoc2(float %x) {		define float @fmul_fma_reassoc2(float %x) {
; FMF-LABEL: fmul_fma_reassoc2:		; FMF-LABEL: fmul_fma_reassoc2:
Show All 19 Lines	; GLOBAL-NEXT: blr
%mul = fmul reassoc float %x, 42.0		%mul = fmul reassoc float %x, 42.0
%fma = call reassoc float @llvm.fma.f32(float %x, float 7.0, float %mul)		%fma = call reassoc float @llvm.fma.f32(float %x, float 7.0, float %mul)
ret float %fma		ret float %fma
}		}

; The FMA is now fully 'fast'. This implies that reassociation is allowed.		; The FMA is now fully 'fast'. This implies that reassociation is allowed.

; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'		; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
; FMFDEBUG: fma {{t[0-9]+}}		; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'		; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'

; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'		; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast1:'
; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}		; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'		; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast1:'

define float @fmul_fma_fast1(float %x) {		define float @fmul_fma_fast1(float %x) {
; FMF-LABEL: fmul_fma_fast1:		; FMF-LABEL: fmul_fma_fast1:
Show All 19 Lines	; GLOBAL-NEXT: blr
%mul = fmul float %x, 42.0		%mul = fmul float %x, 42.0
%fma = call fast float @llvm.fma.f32(float %x, float 7.0, float %mul)		%fma = call fast float @llvm.fma.f32(float %x, float 7.0, float %mul)
ret float %fma		ret float %fma
}		}

; This shouldn't change anything - the intermediate fmul result is now also flagged.		; This shouldn't change anything - the intermediate fmul result is now also flagged.

; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'		; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
; FMFDEBUG: fma {{t[0-9]+}}		; FMFDEBUG: fma nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'		; FMFDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'

; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'		; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'fmul_fma_fast2:'
; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}		; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'		; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'fmul_fma_fast2:'

define float @fmul_fma_fast2(float %x) {		define float @fmul_fma_fast2(float %x) {
; FMF-LABEL: fmul_fma_fast2:		; FMF-LABEL: fmul_fma_fast2:
Show All 19 Lines	; GLOBAL-NEXT: blr
%mul = fmul fast float %x, 42.0		%mul = fmul fast float %x, 42.0
%fma = call fast float @llvm.fma.f32(float %x, float 7.0, float %mul)		%fma = call fast float @llvm.fma.f32(float %x, float 7.0, float %mul)
ret float %fma		ret float %fma
}		}

; Reduced precision for sqrt is allowed - should use estimate and NR iterations.		; Reduced precision for sqrt is allowed - should use estimate and NR iterations.

; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'		; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
; FMFDEBUG: fsqrt {{t[0-9]+}}		; FMFDEBUG: fsqrt afn {{t[0-9]+}}
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:'		; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:'

; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'		; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn:'
; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}		; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:'		; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn:'

define float @sqrt_afn(float %x) {		define float @sqrt_afn(float %x) {
; FMF-LABEL: sqrt_afn:		; FMF-LABEL: sqrt_afn:
Show All 23 Lines
; GLOBAL-NEXT: blr		; GLOBAL-NEXT: blr
%rt = call afn float @llvm.sqrt.f32(float %x)		%rt = call afn float @llvm.sqrt.f32(float %x)
ret float %rt		ret float %rt
}		}

; The call is now fully 'fast'. This implies that approximation is allowed.		; The call is now fully 'fast'. This implies that approximation is allowed.

; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'		; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
; FMFDEBUG: fsqrt {{t[0-9]+}}		; FMFDEBUG: fsqrt nnan ninf nsz arcp contract afn reassoc {{t[0-9]+}}
; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:'		; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:'

; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'		; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_fast:'
; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}		; GLOBALDEBUG: fmul reassoc {{t[0-9]+}}
; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:'		; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_fast:'

define float @sqrt_fast(float %x) {		define float @sqrt_fast(float %x) {
; FMF-LABEL: sqrt_fast:		; FMF-LABEL: sqrt_fast:
Show All 28 Lines

test/CodeGen/X86/fmaxnum.ll

Show First 20 Lines • Show All 284 Lines • ▼ Show 20 Lines	; AVX-NEXT: retq
ret <8 x double> %z		ret <8 x double> %z
}		}

; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.		; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.

define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {		define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:		; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm2		; SSE-NEXT: maxsd %xmm1, %xmm0
; SSE-NEXT: cmpunordsd %xmm0, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm3
; SSE-NEXT: andpd %xmm1, %xmm3
; SSE-NEXT: maxsd %xmm0, %xmm1
; SSE-NEXT: andnpd %xmm1, %xmm2
; SSE-NEXT: orpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:		; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2		; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
%r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)		%r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)
ret double %r		ret double %r
}		}

; FIXME: Make sure vectors work too.		; FIXME: Make sure vectors work too.

define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %b) {		define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f432:		; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f432:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2		; SSE-NEXT: maxps %xmm1, %xmm0
; SSE-NEXT: maxps %xmm0, %xmm2
; SSE-NEXT: cmpunordps %xmm0, %xmm0
; SSE-NEXT: andps %xmm0, %xmm1
; SSE-NEXT: andnps %xmm2, %xmm0
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f432:		; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f432:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm2		; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
%r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)		%r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
ret <4 x float> %r		ret <4 x float> %r
}		}

; Current (but legacy someday): a function-level attribute should also enable the fold.		; Current (but legacy someday): a function-level attribute should also enable the fold.

define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {		define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
Show All 31 Lines

test/CodeGen/X86/fminnum.ll

Show First 20 Lines • Show All 276 Lines • ▼ Show 20 Lines	; AVX-NEXT: retq
ret <8 x double> %z		ret <8 x double> %z
}		}

; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.		; FIXME: The IR-level FMF should propagate to the node. With nnan, there's no need to blend.

define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {		define float @minnum_intrinsic_nnan_fmf_f32(float %a, float %b) {
; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:		; SSE-LABEL: minnum_intrinsic_nnan_fmf_f32:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2		; SSE-NEXT: minss %xmm1, %xmm0
; SSE-NEXT: cmpunordss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: andps %xmm1, %xmm3
; SSE-NEXT: minss %xmm0, %xmm1
; SSE-NEXT: andnps %xmm1, %xmm2
; SSE-NEXT: orps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: minnum_intrinsic_nnan_fmf_f32:		; AVX-LABEL: minnum_intrinsic_nnan_fmf_f32:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vminss %xmm0, %xmm1, %xmm2		; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
%r = tail call nnan float @llvm.minnum.f32(float %a, float %b)		%r = tail call nnan float @llvm.minnum.f32(float %a, float %b)
ret float %r		ret float %r
}		}

; FIXME: Make sure vectors work too.		; FIXME: Make sure vectors work too.

define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x double> %b) {		define <2 x double> @minnum_intrinsic_nnan_fmf_v2f64(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: minnum_intrinsic_nnan_fmf_v2f64:		; SSE-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
; SSE: # %bb.0:		; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2		; SSE-NEXT: minpd %xmm1, %xmm0
; SSE-NEXT: minpd %xmm0, %xmm2
; SSE-NEXT: cmpunordpd %xmm0, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm1
; SSE-NEXT: andnpd %xmm2, %xmm0
; SSE-NEXT: orpd %xmm1, %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: minnum_intrinsic_nnan_fmf_v2f64:		; AVX-LABEL: minnum_intrinsic_nnan_fmf_v2f64:
; AVX: # %bb.0:		; AVX: # %bb.0:
; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm2		; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
%r = tail call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)		%r = tail call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
ret <2 x double> %r		ret <2 x double> %r
}		}

; Current (but legacy someday): a function-level attribute should also enable the fold.		; Current (but legacy someday): a function-level attribute should also enable the fold.

define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {		define double @minnum_intrinsic_nnan_attr_f64(double %a, double %b) #0 {
Show All 31 Lines

test/CodeGen/X86/pr34149.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py

	; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell \| FileCheck %s			; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell \| FileCheck %s

	declare <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)			declare <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
	declare <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)			declare <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)

	define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {			define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {
	; CHECK-LABEL: via_minnum:			; CHECK-LABEL: via_minnum:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm2			; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0
	; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
	; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%z = call fast <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone			%z = call fast <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
	ret <4 x double> %z			ret <4 x double> %z
	}			}

	define <4 x double> @via_maxnum(<4 x double> %x, <4 x double> %y) {			define <4 x double> @via_maxnum(<4 x double> %x, <4 x double> %y) {
	; CHECK-LABEL: via_maxnum:			; CHECK-LABEL: via_maxnum:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm2			; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
	; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
	; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%z = call fast <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone			%z = call fast <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
	ret <4 x double> %z			ret <4 x double> %z
	}			}

	define <4 x double> @via_fcmp(<4 x double> %x, <4 x double> %y) {			define <4 x double> @via_fcmp(<4 x double> %x, <4 x double> %y) {
	; CHECK-LABEL: via_fcmp:			; CHECK-LABEL: via_fcmp:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm0			; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm0
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%c = fcmp ule <4 x double> %x, %y			%c = fcmp ule <4 x double> %x, %y
	%z = select <4 x i1> %c, <4 x double> %x, <4 x double> %y			%z = select <4 x i1> %c, <4 x double> %x, <4 x double> %y
	ret <4 x double> %z			ret <4 x double> %z
	}			}