Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -184,6 +184,8 @@
                        "INC and DEC instructions are slower than ADD and SUB">;
 def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
                           "Use RSQRT* to optimize square root calculations">;
+def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
+                          "true", "Use RCP* to optimize division calculations">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -350,7 +352,7 @@
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI,
                       FeatureF16C, FeatureMOVBE, FeatureLZCNT, FeaturePOPCNT,
                       FeatureSlowSHLD,
-                      FeatureUseSqrtEst]>;
+                      FeatureUseSqrtEst, FeatureUseRecipEst]>;
 
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -1031,6 +1031,10 @@
     SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
                              unsigned &RefinementSteps,
                              bool &UseOneConstNR) const override;
+
+    /// Use rcp* to speed up fdiv calculations.
+    SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps) const override;
   };
 
   namespace X86 {
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -14514,6 +14514,37 @@
   return SDValue();
 }
 
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor.
+  if (!Subtarget->useReciprocalEst())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // reciprocal estimate with refinement on x86 prior to FMA requires
+  // 15 instructions: convert to single, rcpss, convert back to double, refine
+  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    // TODO: Expose this as a user-configurable parameter to allow for
+    // speed vs. accuracy flexibility.
+    RefinementSteps = 1;
+    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -197,6 +197,11 @@
   /// substantially higher than normal FP ops like FADD and FMUL.
   bool UseSqrtEst;
 
+  /// Use the RCP* instructions to optimize FP division calculations.
+  /// For this to be profitable, the cost of FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseReciprocalEst;
+
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -375,6 +380,7 @@
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
   bool useSqrtEst() const { return UseSqrtEst; }
+  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
Index: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+
+; If the target's divss/divps instructions are substantially
+; slower than rcpss/rcpps with a Newton-Raphson refinement,
+; we should generate the estimate sequence.
+
+; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
+; for details about the accuracy, speed, and implementation
+; differences of x86 reciprocal estimates.
+
+define float @reciprocal_estimate(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vsubss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vsubps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vsubps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
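
Note (not part of the patch): the BTVER2 checks in the test pin down the
five-instruction sequence that the generic DAGCombiner builds around the
X86ISD::FRCP node returned by getRecipEstimate(). As a rough sketch of that
one-step Newton-Raphson refinement, e' = e + e * (1 - x * e), here is the
equivalent written with SSE intrinsics (illustrative only; the function name
recip_refined is hypothetical, not from the patch):

  #include <xmmintrin.h>

  // One Newton-Raphson step on top of the hardware estimate. rcpps is
  // architecturally accurate to about 2^-12 relative error, and each step
  // roughly squares the error, so one step reaches about 2^-24.
  static __m128 recip_refined(__m128 x) {
    __m128 e = _mm_rcp_ps(x);              // vrcpps: e ~= 1/x
    __m128 t = _mm_mul_ps(x, e);           // vmulps: x * e
    t = _mm_sub_ps(_mm_set1_ps(1.0f), t);  // vsubps: 1 - x * e
    t = _mm_mul_ps(e, t);                  // vmulps: e * (1 - x * e)
    return _mm_add_ps(e, t);               // vaddps: refined estimate
  }

Squaring the estimate's ~2^-12 error yields ~24 correct bits, i.e. full
float precision, which is why RefinementSteps is set to 1 above.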