This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] optionally filter out denorms when using frsqrte to calculate sqrt
AbandonedPublic

Authored by spatel on Feb 1 2018, 10:25 AM.

Download Raw Diff

Details

Reviewers

evandro
t.p.northover
scanon
dmgreen
efriedma

Summary

This is a follow-up to D42323 where we noticed that AArch64 did not pick up the change for filtering out denorm inputs to a sqrt estimate. That's because it wasn't using the generic DAGCombiner code to mask off a 0.0 sqrt input.

Note: This patch is only for the (odd?) case where we're ok with using a sqrt estimate, but are not using flush-to-zero mode to kill denorms.

I don't have a system to test the HW behavior (someone should confirm whether frsqrte returns INF with a denorm input), and my reading of AArch64 asm isn't good, so please make sure the new code is actually correct.

Diff Detail

Event Timeline

spatel created this revision. · Feb 1 2018, 10:25 AM

Herald added subscribers: kristof.beyls, javed.absar, mcrosier and 2 others. · View Herald Transcript · Feb 1 2018, 10:25 AM

frsqrte produces a finite estimate for denormal numbers. From the ARM ARM:

if exp == 0 then
    while fraction<51> == 0 do
        fraction = fraction<50:0> : '0';
        exp = exp - 1;
    fraction = fraction<50:0> : '0';

In D42806#995193, @efriedma wrote:
frsqrte produces a finite estimate for denormal numbers. From the ARM ARM:
if exp == 0 then
    while fraction<51> == 0 do
        fraction = fraction<50:0> : '0';
        exp = exp - 1;
    fraction = fraction<50:0> : '0';

Ah, great - so we don't need this patch. Should still change the select operand to FPZero to produce the 'bic' rather than 'bsl' though?

Should still change the select operand to FPZero to produce the 'bic' rather than 'bsl' though?

I think that changes the result for sqrt(-0.0)? I guess we could still do it under appropriate fast-math flags.

In D42806#995250, @efriedma wrote:

Should still change the select operand to FPZero to produce the 'bic' rather than 'bsl' though?

I think that changes the result for sqrt(-0.0)? I guess we could still do it under appropriate fast-math flags.

Good point. The DAGCombiner estimate generation is only guarded by Options.UnsafeFPMath (doesn't check nodes' flags or NoSignedZerosFPMath), so it's probably wrong currently.

spatel mentioned this in rL323996: [AArch64] remove bogus comment; NFC. · Feb 1 2018, 12:01 PM

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

59 lines

Target/

AArch64/

AArch64ISelLowering.cpp

13 lines

test/

CodeGen/

AArch64/

sqrt-fastmath.ll

143 lines

Diff 132420

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 17,446 Lines • ▼ Show 20 Lines	if (SDValue Est =
TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,		TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
Reciprocal)) {		Reciprocal)) {
AddToWorklist(Est.getNode());		AddToWorklist(Est.getNode());

if (Iterations) {		if (Iterations) {
Est = UseOneConstNR		Est = UseOneConstNR
? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)		? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);		: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
		}
if (!Reciprocal) {		if (!Reciprocal) {
// The estimate is now completely wrong if the input was exactly 0.0 or		// The estimate is now completely wrong if the input was exactly 0.0 or
// possibly a denormal. Force the answer to 0.0 for those cases.		// possibly a denormal. Force the answer to 0.0 for those cases.
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
SDLoc DL(Op);		SDLoc DL(Op);
EVT CCVT = getSetCCResultType(VT);		EVT CCVT = getSetCCResultType(VT);
ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;		ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
const Function &F = DAG.getMachineFunction().getFunction();		const Function &F = DAG.getMachineFunction().getFunction();
Attribute Denorms = F.getFnAttribute("denormal-fp-math");		Attribute Denorms = F.getFnAttribute("denormal-fp-math");
if (Denorms.getValueAsString().equals("ieee")) {		if (Denorms.getValueAsString().equals("ieee")) {
// fabs(X) < SmallestNormal ? 0.0 : Est		// fabs(X) < SmallestNormal ? 0.0 : Est
const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);		const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);		APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);		SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);		SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);		SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);		SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);		Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
AddToWorklist(Fabs.getNode());		AddToWorklist(Fabs.getNode());
AddToWorklist(IsDenorm.getNode());		AddToWorklist(IsDenorm.getNode());
AddToWorklist(Est.getNode());		AddToWorklist(Est.getNode());
} else {		} else {
// X == 0.0 ? 0.0 : Est		// X == 0.0 ? 0.0 : Est
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);		SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);		SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);		Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
AddToWorklist(IsZero.getNode());		AddToWorklist(IsZero.getNode());
AddToWorklist(Est.getNode());		AddToWorklist(Est.getNode());
}		}
}		}
}
return Est;		return Est;
}		}

return SDValue();		return SDValue();
}		}

SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {		SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
return buildSqrtEstimateImpl(Op, Flags, true);		return buildSqrtEstimateImpl(Op, Flags, true);
▲ Show 20 Lines • Show All 328 Lines • Show Last 20 Lines

lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,001 Lines • ▼ Show 20 Lines	if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)		// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)		// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {		for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,		SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
Flags);		Flags);
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);		Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);		Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}		}
// FIXME: This does not detect denorm inputs, so we might produce INF
// when we should produce 0.0. Try to refactor the code in DAGCombiner,
// so we don't have to duplicate it here.
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

		if (!Reciprocal)
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);		Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
}

ExtraSteps = 0;		ExtraSteps = 0;
return Estimate;		return Estimate;
}		}

return SDValue();		return SDValue();
}		}

▲ Show 20 Lines • Show All 6,031 Lines • Show Last 20 Lines

test/CodeGen/AArch64/sqrt-fastmath.ll

	Show All 27 Lines
	; CHECK-NEXT: fmul s1, s1, s2			; CHECK-NEXT: fmul s1, s1, s2
	; CHECK-NEXT: fcmp s0, #0.0			; CHECK-NEXT: fcmp s0, #0.0
	; CHECK-NEXT: fcsel s0, s0, s1, eq			; CHECK-NEXT: fcsel s0, s0, s1, eq
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast float @llvm.sqrt.f32(float %a)			%1 = tail call fast float @llvm.sqrt.f32(float %a)
	ret float %1			ret float %1
	}			}

				; CHECK: .LCPI1_0:
				; CHECK-NEXT: .word 8388608 // float 1.17549435E-38

	define float @fsqrt_ieee_denorms(float %a) #1 {			define float @fsqrt_ieee_denorms(float %a) #1 {
	; FAULT-LABEL: fsqrt_ieee_denorms:			; FAULT-LABEL: fsqrt_ieee_denorms:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt s0, s0			; FAULT-NEXT: fsqrt s0, s0
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: fsqrt_ieee_denorms:			; CHECK-LABEL: fsqrt_ieee_denorms:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
				; CHECK-NEXT: adrp x8, .LCPI1_0
				; CHECK-NEXT: ldr s1, [x8, :lo12:.LCPI1_0]
				; CHECK-NEXT: fabs s2, s0
				; CHECK-NEXT: fcmp s2, s1
	; CHECK-NEXT: frsqrte s1, s0			; CHECK-NEXT: frsqrte s1, s0
	; CHECK-NEXT: fmul s2, s1, s1			; CHECK-NEXT: fmul s2, s1, s1
	; CHECK-NEXT: frsqrts s2, s0, s2			; CHECK-NEXT: frsqrts s2, s0, s2
	; CHECK-NEXT: fmul s1, s1, s2			; CHECK-NEXT: fmul s1, s1, s2
	; CHECK-NEXT: fmul s2, s1, s1			; CHECK-NEXT: fmul s2, s1, s1
	; CHECK-NEXT: frsqrts s2, s0, s2			; CHECK-NEXT: frsqrts s2, s0, s2
	; CHECK-NEXT: fmul s2, s2, s0			; CHECK-NEXT: fmul s0, s2, s0
	; CHECK-NEXT: fmul s1, s1, s2			; CHECK-NEXT: fmul s0, s1, s0
	; CHECK-NEXT: fcmp s0, #0.0			; CHECK-NEXT: fmov s1, wzr
	; CHECK-NEXT: fcsel s0, s0, s1, eq			; CHECK-NEXT: fcsel s0, s1, s0, lt
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast float @llvm.sqrt.f32(float %a)			%1 = tail call fast float @llvm.sqrt.f32(float %a)
	ret float %1			ret float %1
	}			}

	define <2 x float> @f2sqrt(<2 x float> %a) #0 {			define <2 x float> @f2sqrt(<2 x float> %a) #0 {
	; FAULT-LABEL: f2sqrt:			; FAULT-LABEL: f2sqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt v0.2s, v0.2s			; FAULT-NEXT: fsqrt v0.2s, v0.2s
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: f2sqrt:			; CHECK-LABEL: f2sqrt:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: frsqrte v1.2s, v0.2s			; CHECK-NEXT: frsqrte v1.2s, v0.2s
	; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s			; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s
	; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s			; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s
	; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s			; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s
	; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s			; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s
	; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s			; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s
	; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s			; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s
	; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s			; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s
	; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0			; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0
	; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b			; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b
	; CHECK-NEXT: mov v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a)			%1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a)
	ret <2 x float> %1			ret <2 x float> %1
	}			}

	define <4 x float> @f4sqrt(<4 x float> %a) #0 {			define <4 x float> @f4sqrt(<4 x float> %a) #0 {
	; FAULT-LABEL: f4sqrt:			; FAULT-LABEL: f4sqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt v0.4s, v0.4s			; FAULT-NEXT: fsqrt v0.4s, v0.4s
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: f4sqrt:			; CHECK-LABEL: f4sqrt:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: frsqrte v1.4s, v0.4s			; CHECK-NEXT: frsqrte v1.4s, v0.4s
	; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s			; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s
	; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s			; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s
	; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s			; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s			; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s
	; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s			; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s
	; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s			; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s
	; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s			; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0			; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0
	; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b			; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
	; CHECK-NEXT: mov v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)			%1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
	ret <4 x float> %1			ret <4 x float> %1
	}			}

	define <8 x float> @f8sqrt(<8 x float> %a) #0 {			define <8 x float> @f8sqrt(<8 x float> %a) #0 {
	; FAULT-LABEL: f8sqrt:			; FAULT-LABEL: f8sqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt v0.4s, v0.4s			; FAULT-NEXT: fsqrt v0.4s, v0.4s
	; FAULT-NEXT: fsqrt v1.4s, v1.4s			; FAULT-NEXT: fsqrt v1.4s, v1.4s
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: f8sqrt:			; CHECK-LABEL: f8sqrt:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: frsqrte v2.4s, v0.4s			; CHECK-NEXT: frsqrte v2.4s, v0.4s
	; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s			; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
	; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s			; CHECK-NEXT: frsqrte v3.4s, v1.4s
	; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s			; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s
	; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s			; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s
	; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s			; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s
	; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s			; CHECK-NEXT: frsqrts v4.4s, v1.4s, v4.4s
	; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s			; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s
	; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0			; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
	; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b			; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s
	; CHECK-NEXT: frsqrte v0.4s, v1.4s			; CHECK-NEXT: fmul v4.4s, v4.4s, v0.4s
	; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s			; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s
	; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s			; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s
	; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s			; CHECK-NEXT: frsqrts v4.4s, v1.4s, v4.4s
	; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s			; CHECK-NEXT: fmul v4.4s, v4.4s, v1.4s
	; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s			; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s
	; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s			; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0
	; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s			; CHECK-NEXT: fcmeq v1.4s, v1.4s, #0.0
	; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0			; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b
	; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b			; CHECK-NEXT: bic v1.16b, v3.16b, v1.16b
	; CHECK-NEXT: mov v0.16b, v2.16b
	; CHECK-NEXT: mov v1.16b, v3.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a)			%1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a)
	ret <8 x float> %1			ret <8 x float> %1
	}			}

	define double @dsqrt(double %a) #0 {			define double @dsqrt(double %a) #0 {
	; FAULT-LABEL: dsqrt:			; FAULT-LABEL: dsqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	Show All 15 Lines
	; CHECK-NEXT: fmul d1, d1, d2			; CHECK-NEXT: fmul d1, d1, d2
	; CHECK-NEXT: fcmp d0, #0.0			; CHECK-NEXT: fcmp d0, #0.0
	; CHECK-NEXT: fcsel d0, d0, d1, eq			; CHECK-NEXT: fcsel d0, d0, d1, eq
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast double @llvm.sqrt.f64(double %a)			%1 = tail call fast double @llvm.sqrt.f64(double %a)
	ret double %1			ret double %1
	}			}

				; CHECK: .LCPI6_0:
				; CHECK-NEXT: .xword 4503599627370496 // double 2.2250738585072014E-308

	define double @dsqrt_ieee_denorms(double %a) #1 {			define double @dsqrt_ieee_denorms(double %a) #1 {
	; FAULT-LABEL: dsqrt_ieee_denorms:			; FAULT-LABEL: dsqrt_ieee_denorms:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt d0, d0			; FAULT-NEXT: fsqrt d0, d0
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: dsqrt_ieee_denorms:			; CHECK-LABEL: dsqrt_ieee_denorms:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
				; CHECK-NEXT: adrp x8, .LCPI6_0
				; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0]
				; CHECK-NEXT: fabs d2, d0
				; CHECK-NEXT: fcmp d2, d1
	; CHECK-NEXT: frsqrte d1, d0			; CHECK-NEXT: frsqrte d1, d0
	; CHECK-NEXT: fmul d2, d1, d1			; CHECK-NEXT: fmul d2, d1, d1
	; CHECK-NEXT: frsqrts d2, d0, d2			; CHECK-NEXT: frsqrts d2, d0, d2
	; CHECK-NEXT: fmul d1, d1, d2			; CHECK-NEXT: fmul d1, d1, d2
	; CHECK-NEXT: fmul d2, d1, d1			; CHECK-NEXT: fmul d2, d1, d1
	; CHECK-NEXT: frsqrts d2, d0, d2			; CHECK-NEXT: frsqrts d2, d0, d2
	; CHECK-NEXT: fmul d1, d1, d2			; CHECK-NEXT: fmul d1, d1, d2
	; CHECK-NEXT: fmul d2, d1, d1			; CHECK-NEXT: fmul d2, d1, d1
	; CHECK-NEXT: frsqrts d2, d0, d2			; CHECK-NEXT: frsqrts d2, d0, d2
	; CHECK-NEXT: fmul d2, d2, d0			; CHECK-NEXT: fmul d0, d2, d0
	; CHECK-NEXT: fmul d1, d1, d2			; CHECK-NEXT: fmul d0, d1, d0
	; CHECK-NEXT: fcmp d0, #0.0			; CHECK-NEXT: fmov d1, xzr
	; CHECK-NEXT: fcsel d0, d0, d1, eq			; CHECK-NEXT: fcsel d0, d1, d0, lt
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast double @llvm.sqrt.f64(double %a)			%1 = tail call fast double @llvm.sqrt.f64(double %a)
	ret double %1			ret double %1
	}			}

	define <2 x double> @d2sqrt(<2 x double> %a) #0 {			define <2 x double> @d2sqrt(<2 x double> %a) #0 {
	; FAULT-LABEL: d2sqrt:			; FAULT-LABEL: d2sqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt v0.2d, v0.2d			; FAULT-NEXT: fsqrt v0.2d, v0.2d
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: d2sqrt:			; CHECK-LABEL: d2sqrt:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: frsqrte v1.2d, v0.2d			; CHECK-NEXT: frsqrte v1.2d, v0.2d
	; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d			; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d
	; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d			; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d
	; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d			; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
	; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d			; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d
	; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d			; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d
	; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d			; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
	; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d			; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d
	; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d			; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d
	; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d			; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d
	; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d			; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
	; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0			; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
	; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b			; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
	; CHECK-NEXT: mov v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)			%1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
	ret <2 x double> %1			ret <2 x double> %1
	}			}

	define <4 x double> @d4sqrt(<4 x double> %a) #0 {			define <4 x double> @d4sqrt(<4 x double> %a) #0 {
	; FAULT-LABEL: d4sqrt:			; FAULT-LABEL: d4sqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	; FAULT-NEXT: fsqrt v0.2d, v0.2d			; FAULT-NEXT: fsqrt v0.2d, v0.2d
	; FAULT-NEXT: fsqrt v1.2d, v1.2d			; FAULT-NEXT: fsqrt v1.2d, v1.2d
	; FAULT-NEXT: ret			; FAULT-NEXT: ret
	;			;
	; CHECK-LABEL: d4sqrt:			; CHECK-LABEL: d4sqrt:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: frsqrte v2.2d, v0.2d			; CHECK-NEXT: frsqrte v2.2d, v0.2d
	; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d			; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
	; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d			; CHECK-NEXT: frsqrte v3.2d, v1.2d
	; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d			; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
	; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d			; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
	; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d			; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d
	; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d			; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d
	; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d			; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d
	; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d			; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
	; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d			; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
	; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d			; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
	; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0			; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d
	; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b			; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d
	; CHECK-NEXT: frsqrte v0.2d, v1.2d			; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d
	; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d			; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
	; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d			; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
	; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d			; CHECK-NEXT: fmul v4.2d, v4.2d, v0.2d
	; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d			; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
	; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d			; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d
	; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d			; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d
	; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d			; CHECK-NEXT: fmul v4.2d, v4.2d, v1.2d
	; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d			; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d
	; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d			; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
	; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d			; CHECK-NEXT: fcmeq v1.2d, v1.2d, #0.0
	; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0			; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b
	; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b			; CHECK-NEXT: bic v1.16b, v3.16b, v1.16b
	; CHECK-NEXT: mov v0.16b, v2.16b
	; CHECK-NEXT: mov v1.16b, v3.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a)			%1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a)
	ret <4 x double> %1			ret <4 x double> %1
	}			}

	define float @frsqrt(float %a) #0 {			define float @frsqrt(float %a) #0 {
	; FAULT-LABEL: frsqrt:			; FAULT-LABEL: frsqrt:
	; FAULT: // %bb.0:			; FAULT: // %bb.0:
	▲ Show 20 Lines • Show All 191 Lines • Show Last 20 Lines