This is an archive of the discontinued LLVM Phabricator instance.

Improve sqrt estimate algorithm
ClosedPublic

Authored by spatel on Oct 8 2014, 1:53 PM.

Download Raw Diff

Details

Reviewers

willschm
wschmidt
hfinkel

Commits

rG3d497cd77826: Improve sqrt estimate algorithm (fast-math)
rL219445: Improve sqrt estimate algorithm (fast-math)

Summary

This patch changes the fast-math implementation for calculating sqrt(x) from:
y = 1 / (1 / sqrt(x))
to:
y = x * (1 / sqrt(x))

This has two benefits: the code is smaller and faster, and there is one fewer estimate instruction that may lose precision.

The only target that will be affected (until http://reviews.llvm.org/D5658 is approved) is PPC. The difference in codegen for PPC is 2 fewer flops for a single-precision sqrtf or vector sqrtf and 4 fewer flops for a double-precision sqrt. We also eliminate a constant load and extra register usage.

Here's the existing PPC codegen for a single-precision scalar sqrtf() using a reciprocal square root estimate and a reciprocal estimate:

.L.goo3:
# BB#0:
   addis 3, 2, .LCPI10_2@toc@ha
   lfs 0, .LCPI10_2@toc@l(3)
   fcmpu 0, 1, 0
   beq 0, .LBB10_2
# BB#1:
   frsqrtes 0, 1
   addis 3, 2, .LCPI10_0@toc@ha
   lfs 2, .LCPI10_0@toc@l(3)
   addis 3, 2, .LCPI10_1@toc@ha
   lfs 13, .LCPI10_1@toc@l(3)
   fnmsubs 12, 1, 2, 1
   fmuls 3, 0, 0
   fmadds 1, 12, 3, 2
   fmuls 0, 0, 1
   fres 1, 0                    <--- reciprocal estimate
   fnmsubs 0, 0, 1, 13  <--- refinement
   fmadds 0, 1, 0, 1      <--- refinement
.LBB10_2:
   fmr 1, 0
   blr

After the patch, we calculate the rsqrt and multiply by the original operand:

.L.goo3:
# BB#0:
   addis 3, 2, .LCPI10_1@toc@ha
   lfs 0, .LCPI10_1@toc@l(3)
   fcmpu 0, 1, 0
   beq 0, .LBB10_2
# BB#1:
   frsqrtes 0, 1
   addis 3, 2, .LCPI10_0@toc@ha
   lfs 2, .LCPI10_0@toc@l(3)   <--- only need 1 constant for NR sqrt refinement
   fnmsubs 3, 1, 2, 1
   fmuls 4, 0, 0
   fmadds 2, 3, 4, 2
   fmuls 0, 0, 2
   fmuls 0, 1, 0   <--- reciprocal calc replaced by multiply
.LBB10_2:
   fmr 1, 0
   blr

Diff Detail

Repository: rL LLVM

Event Timeline

spatel updated this revision to Diff 14600. Oct 8 2014, 1:53 PM

spatel retitled this revision to "Improve sqrt estimate algorithm".

spatel updated this object.

spatel edited the test plan for this revision. (Show Details)

spatel added reviewers: hfinkel, wschmidt, willschm.

spatel added a subscriber: Unknown Object (MLST).

Awesome, thanks! (LGTM)

-Hal

This revision is now accepted and ready to land. Oct 9 2014, 12:53 PM

Thanks, Hal!

One other comment I'd like to point out here regarding the PPC codegen: I was expecting an 'fsel' to be generated rather than a compare and branch. I won't be able to look into why that happens anytime soon, but I can file a bug if you think that's appropriate.

X86 codegen (once we enable that) is not going to generate a compare for any of these testcases from what I can tell.

> One other comment I'd like to point out here regarding the PPC codegen: I was expecting an 'fsel' to be generated rather than a compare and branch. I won't be able to look into why that happens anytime soon, but I can file a bug if you think that's appropriate.

Yes, please file a PR and we'll look at it; thanks!

Closed by commit rL219445 (authored by @spatel).

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

33 lines

test/

CodeGen/

PowerPC/

recipest.ll

11 lines

Diff 14678

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,082 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::visitFREM(SDNode *N) {
if (N0CFP && N1CFP)		if (N0CFP && N1CFP)
return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1);		return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1);

return SDValue();		return SDValue();
}		}

SDValue DAGCombiner::visitFSQRT(SDNode *N) {		SDValue DAGCombiner::visitFSQRT(SDNode *N) {
if (DAG.getTarget().Options.UnsafeFPMath) {		if (DAG.getTarget().Options.UnsafeFPMath) {
// Compute this as 1/(1/sqrt(X)): the reciprocal of the reciprocal sqrt.		// Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {		if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
AddToWorklist(RV.getNode());		AddToWorklist(RV.getNode());
RV = BuildReciprocalEstimate(RV);
if (RV.getNode()) {
// Unfortunately, RV is now NaN if the input was exactly 0.
// Select out this case and force the answer to 0.
EVT VT = RV.getValueType();		EVT VT = RV.getValueType();
		RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV);
		AddToWorklist(RV.getNode());

		// Unfortunately, RV is now NaN if the input was exactly 0.
		// Select out this case and force the answer to 0.
SDValue Zero = DAG.getConstantFP(0.0, VT);		SDValue Zero = DAG.getConstantFP(0.0, VT);
SDValue ZeroCmp =		SDValue ZeroCmp =
DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT),		DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT),
N->getOperand(0), Zero, ISD::SETEQ);		N->getOperand(0), Zero, ISD::SETEQ);
AddToWorklist(ZeroCmp.getNode());		AddToWorklist(ZeroCmp.getNode());
AddToWorklist(RV.getNode());		AddToWorklist(RV.getNode());

RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT,		RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT,
SDLoc(N), VT, ZeroCmp, Zero, RV);		SDLoc(N), VT, ZeroCmp, Zero, RV);
return RV;		return RV;
}		}
}		}
}
return SDValue();		return SDValue();
}		}

SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {		SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);		ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);		ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
▲ Show 20 Lines • Show All 5,144 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/PowerPC/recipest.ll

	Show First 20 Lines • Show All 191 Lines • ▼ Show 20 Lines
	; CHECK-DAG: frsqrte			; CHECK-DAG: frsqrte
	; CHECK-DAG: fnmsub			; CHECK-DAG: fnmsub
	; CHECK: fmul			; CHECK: fmul
	; CHECK-NEXT: fmadd			; CHECK-NEXT: fmadd
	; CHECK-NEXT: fmul			; CHECK-NEXT: fmul
	; CHECK-NEXT: fmul			; CHECK-NEXT: fmul
	; CHECK-NEXT: fmadd			; CHECK-NEXT: fmadd
	; CHECK-NEXT: fmul			; CHECK-NEXT: fmul
	; CHECK-NEXT: fre			; CHECK-NEXT: fmul
	; CHECK-NEXT: fnmsub
	; CHECK-NEXT: fmadd
	; CHECK-NEXT: fnmsub
	; CHECK-NEXT: fmadd
	; CHECK: blr			; CHECK: blr

	; CHECK-SAFE: @foo3			; CHECK-SAFE: @foo3
	; CHECK-SAFE: fsqrt			; CHECK-SAFE: fsqrt
	; CHECK-SAFE: blr			; CHECK-SAFE: blr
	}			}

	define float @goo3(float %a) nounwind {			define float @goo3(float %a) nounwind {
	%r = call float @llvm.sqrt.f32(float %a)			%r = call float @llvm.sqrt.f32(float %a)
	ret float %r			ret float %r

	; CHECK: @goo3			; CHECK: @goo3
	; CHECK: fcmpu			; CHECK: fcmpu
	; CHECK-DAG: frsqrtes			; CHECK-DAG: frsqrtes
	; CHECK-DAG: fnmsubs			; CHECK-DAG: fnmsubs
	; CHECK: fmuls			; CHECK: fmuls
	; CHECK-NEXT: fmadds			; CHECK-NEXT: fmadds
	; CHECK-NEXT: fmuls			; CHECK-NEXT: fmuls
	; CHECK-NEXT: fres			; CHECK-NEXT: fmuls
	; CHECK-NEXT: fnmsubs
	; CHECK-NEXT: fmadds
	; CHECK: blr			; CHECK: blr

	; CHECK-SAFE: @goo3			; CHECK-SAFE: @goo3
	; CHECK-SAFE: fsqrts			; CHECK-SAFE: fsqrts
	; CHECK-SAFE: blr			; CHECK-SAFE: blr
	}			}

	define <4 x float> @hoo3(<4 x float> %a) nounwind {			define <4 x float> @hoo3(<4 x float> %a) nounwind {
	%r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)			%r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
	ret <4 x float> %r			ret <4 x float> %r

	; CHECK: @hoo3			; CHECK: @hoo3
	; CHECK: vrsqrtefp			; CHECK: vrsqrtefp
	; CHECK-DAG: vrefp
	; CHECK-DAG: vcmpeqfp			; CHECK-DAG: vcmpeqfp

	; CHECK-SAFE: @hoo3			; CHECK-SAFE: @hoo3
	; CHECK-SAFE-NOT: vrsqrtefp			; CHECK-SAFE-NOT: vrsqrtefp
	; CHECK-SAFE: blr			; CHECK-SAFE: blr
	}			}