Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -182,6 +182,8 @@ "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; +def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", + "Use RSQRT* to optimize square root calculations">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -347,7 +349,8 @@ [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, + FeatureUseSqrtEst]>; // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -1014,6 +1014,10 @@ /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; }; namespace X86 { Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -14335,6 +14335,59 @@ return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +/// Override the default NR algorithm because the 2-constant implementation +/// runs faster on Intel SandyBridge and AMD Jaguar (btver2). It has one +/// less FP instruction in exchange for an extra constant load that should +/// be folded into an FP op. +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // sig digs in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX (v8f32) and AVX512 (v16f32). + // TODO: Is it ever worthwhile to use an estimate for f64? + if (Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) { + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal sqrt, we need to find the zero of the function: + // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] + // => + // X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) + SDLoc DL(Operand); + SDValue MinusThree = DCI.DAG.getConstantFP(-3.0, VT); + SDValue MinusHalf = DCI.DAG.getConstantFP(-0.5, VT); + + SDValue Est = DCI.DAG.getNode(X86ISD::FRSQRT, DL, VT, Operand); + SDValue HalfEst = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf); + DCI.AddToWorklist(HalfEst.getNode()); + + Est = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + DCI.AddToWorklist(Est.getNode()); + + Est = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, Operand); + DCI.AddToWorklist(Est.getNode()); + + Est = DCI.DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree); + DCI.AddToWorklist(Est.getNode()); + + Est = DCI.DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst); + DCI.AddToWorklist(Est.getNode()); + + RefinementSteps = 0; + return Est; + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast(V); return C && C->isAllOnesValue(); Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -192,6 +192,11 @@ /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; + /// Use the RSQRT* instructions to optimize square root calculations. + /// For this to be profitable, the cost of FSQRT and FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseSqrtEst; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -369,6 +374,7 @@ bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } + bool useSqrtEst() const { return UseSqrtEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -278,6 +278,7 @@ LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; + UseSqrtEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; Index: test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath.ll +++ test/CodeGen/X86/sqrt-fastmath.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from ; #include @@ -52,9 +53,59 @@ ret x86_fp80 %call } -; Function Attrs: nounwind readnone declare x86_fp80 @__sqrtl_finite(x86_fp80) #1 +; If the target's sqrtss and divss instructions are substantially +; slower than rsqrtss with a Newton-Raphson refinement, we should +; generate the estimate sequence. +define float @reciprocal_square_root(float %x) #0 { + %sqrt = tail call float @llvm.sqrt.f32(float %x) + %div = fdiv fast float 1.0, %sqrt + ret float %div + +; CHECK-LABEL: reciprocal_square_root: +; CHECK: sqrtss +; CHECK-NEXT: movss +; CHECK-NEXT: divss +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root: +; BTVER2: vrsqrtss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vaddss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: retq +} + +declare float @llvm.sqrt.f32(float) #1 + +; If the target's sqrtps and divps instructions are substantially +; slower than rsqrtps with a Newton-Raphson refinement, we should +; generate the estimate sequence. +define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 { + %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) + %div = fdiv fast <4 x float> , %sqrt + ret <4 x float> %div + +; CHECK-LABEL: reciprocal_square_root_v4f32: +; CHECK: sqrtps +; CHECK-NEXT: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root_v4f32: +; BTVER2: vrsqrtps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vaddps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: retq +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1 + + attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind readnone }