Index: include/llvm/CodeGen/CommandFlags.h
===================================================================
--- include/llvm/CodeGen/CommandFlags.h
+++ include/llvm/CodeGen/CommandFlags.h
@@ -24,6 +24,7 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRecip.h"
 #include <string>
 using namespace llvm;
 
@@ -152,6 +153,12 @@
                "Only fuse FP ops when the result won't be effected."),
     clEnumValEnd));
 
+cl::list<std::string>
+ReciprocalOps("recip",
+  cl::CommaSeparated,
+  cl::desc("Choose reciprocal operation types and parameters."),
+  cl::value_desc("all,none,default,divf,vec-sqrtd,vec-divd:0,sqrtf:9..."));
+
 cl::opt<bool>
 DontPlaceZerosInBSS("nozero-initialized-in-bss",
               cl::desc("Don't place zero-initialized symbols into bss section"),
@@ -231,6 +238,7 @@
   Options.LessPreciseFPMADOption = EnableFPMAD;
   Options.NoFramePointerElim = DisableFPElim;
   Options.AllowFPOpFusion = FuseFPOps;
+  Options.Reciprocals = ReciprocalOps;
   Options.UnsafeFPMath = EnableUnsafeFPMath;
   Options.NoInfsFPMath = EnableNoInfsFPMath;
   Options.NoNaNsFPMath = EnableNoNaNsFPMath;
Index: include/llvm/Target/TargetOptions.h
===================================================================
--- include/llvm/Target/TargetOptions.h
+++ include/llvm/Target/TargetOptions.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TARGET_TARGETOPTIONS_H
 #define LLVM_TARGET_TARGETOPTIONS_H
 
+#include "llvm/Target/TargetRecip.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include <string>
 
@@ -72,7 +73,8 @@
           CompressDebugSections(false), FunctionSections(false),
           DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
           TrapFuncName(), FloatABIType(FloatABI::Default),
-          AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
+          AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(),
+          JTType(JumpTable::Single),
           ThreadModel(ThreadModel::POSIX) {}
 
     /// PrintMachineCode - This flag is enabled when the -print-machineinstrs
@@ -211,6 +213,9 @@
     /// the value of this option.
     FPOpFusion::FPOpFusionMode AllowFPOpFusion;
 
+    /// This class encapsulates options for reciprocal-estimate code generation.
+    TargetRecip Reciprocals;
+
     /// JTType - This flag specifies the type of jump-instruction table to
     /// create for functions that have the jumptable attribute.
     JumpTable::JumpTableType JTType;
@@ -245,6 +250,7 @@
                     ARE_EQUAL(TrapFuncName) &&
                     ARE_EQUAL(FloatABIType) &&
                     ARE_EQUAL(AllowFPOpFusion) &&
+                    ARE_EQUAL(Reciprocals) &&
                     ARE_EQUAL(JTType) &&
                     ARE_EQUAL(ThreadModel) &&
                     ARE_EQUAL(MCOptions);
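
Note for reviewers: the cl::list above feeds TargetOptions::Reciprocals through
TargetRecip's converting constructor. A minimal sketch of the same plumbing for
a standalone driver follows (the function name is hypothetical and not part of
this patch):

    #include "llvm/Target/TargetOptions.h" // pulls in TargetRecip.h per this patch
    #include <string>
    #include <vector>

    // Forward parsed -recip values into TargetOptions. TargetRecip's
    // vector-of-string constructor accepts values such as "divf",
    // "vec-divf:2", or a lone "all", "none", or "default".
    static llvm::TargetOptions makeOptions(const std::vector<std::string> &RecipArgs) {
      llvm::TargetOptions Options;
      Options.Reciprocals = llvm::TargetRecip(RecipArgs);
      return Options;
    }
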
Index: include/llvm/Target/TargetRecip.h
===================================================================
--- include/llvm/Target/TargetRecip.h
+++ include/llvm/Target/TargetRecip.h
@@ -0,0 +1,74 @@
+//===--------------------- llvm/Target/TargetRecip.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_TARGETRECIP_H
+#define LLVM_TARGET_TARGETRECIP_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+enum RecipOps {
+  RO_DivF = 0,  // division, float, scalar
+  RO_VecDivF,   // division, float, vector
+  RO_DivD,      // division, double, scalar
+  RO_VecDivD,   // division, double, vector
+  RO_SqrtF,     // square root, float, scalar
+  RO_VecSqrtF,  // square root, float, vector
+  RO_SqrtD,     // square root, double, scalar
+  RO_VecSqrtD,  // square root, double, vector
+  RO_All,
+  RO_NUM_RECIP_OPS = RO_All
+};
+
+class TargetRecip {
+public:
+  TargetRecip();
+
+  /// Initialize all or part of the operations from command-line options or
+  /// encoded strings.
+  TargetRecip(const std::vector<std::string> &Args);
+
+  virtual ~TargetRecip();
+
+  /// Set whether a particular reciprocal operation is enabled and how many
+  /// refinement steps are needed when using it. Use the 'RO_All' value to
+  /// set the enablement and refinement steps for all operations.
+  void setDefaults(RecipOps Op, bool Enable, unsigned RefSteps);
+
+  /// Return true if the reciprocal operation has been enabled by default or
+  /// from the command-line. Return false if the operation has been disabled
+  /// by default or from the command-line.
+  bool isEnabled(RecipOps Op) const;
+
+  /// Return the number of iterations necessary to refine the result of a
+  /// machine instruction for the given reciprocal operation.
+  unsigned getRefinementSteps(RecipOps Op) const;
+
+  bool operator==(const TargetRecip &Other) const;
+
+private:
+  int8_t Enabled[RO_NUM_RECIP_OPS];
+  int8_t RefinementSteps[RO_NUM_RECIP_OPS];
+
+  bool ParseGlobalParams(const std::string &Arg);
+  void ParseIndividualParams(const std::string &Arg);
+};
+
+} // End llvm namespace
+
+#endif
Index: lib/Target/CMakeLists.txt
===================================================================
--- lib/Target/CMakeLists.txt
+++ lib/Target/CMakeLists.txt
@@ -6,6 +6,7 @@
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp
   TargetMachineC.cpp
+  TargetRecip.cpp
   TargetSubtargetInfo.cpp
 
   ADDITIONAL_HEADER_DIRS
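
To make the header's contract concrete, here is a small self-contained sketch
(example values only; nothing here is a shipped default) showing that explicit
command-line settings survive a later setDefaults() call:

    #include "llvm/Target/TargetRecip.h"
    #include <cassert>
    #include <string>
    #include <vector>

    void recipPrecedenceExample() {
      // As if the user passed -recip=divf:2.
      std::vector<std::string> Args = {"divf:2"};
      llvm::TargetRecip Recips(Args);

      // A target then applies its defaults; setDefaults() only fills slots
      // the command line left uninitialized, so the explicit divf:2 wins.
      Recips.setDefaults(llvm::RO_All, /*Enable=*/false, /*RefSteps=*/1);

      assert(Recips.isEnabled(llvm::RO_DivF));
      assert(Recips.getRefinementSteps(llvm::RO_DivF) == 2);
      assert(!Recips.isEnabled(llvm::RO_VecDivF)); // came from the default
    }
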
Index: lib/Target/TargetRecip.cpp
===================================================================
--- lib/Target/TargetRecip.cpp
+++ lib/Target/TargetRecip.cpp
@@ -0,0 +1,204 @@
+//===-------------------------- TargetRecip.cpp ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRecip.h"
+#include <cassert>
+
+using namespace llvm;
+
+// These must be in the same order as the corresponding enum values.
+const char *ArgStrings[] = {
+  "divf",
+  "vec-divf",
+  "divd",
+  "vec-divd",
+  "sqrtf",
+  "vec-sqrtf",
+  "sqrtd",
+  "vec-sqrtd"
+};
+
+// The uninitialized state is needed for the enablement bits and refinement
+// steps because custom settings may arrive via the command-line before target
+// defaults are set.
+enum {
+  Uninitialized = -1
+};
+
+TargetRecip::TargetRecip() {
+  for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+    Enabled[i] = Uninitialized;
+    RefinementSteps[i] = Uninitialized;
+  }
+}
+
+static bool ParseRefinementStep(const std::string &In, size_t &Position,
+                                uint8_t &Value) {
+  const char REF_STEP_TOKEN = ':';
+  Position = In.find(REF_STEP_TOKEN);
+  if (Position == std::string::npos)
+    return false;
+
+  std::string RefStepString = In.substr(Position + 1);
+  // Allow exactly one numeric character for the additional refinement
+  // step parameter.
+  if (RefStepString.length() == 1) {
+    char RefStepChar = RefStepString[0];
+    if (RefStepChar >= '0' && RefStepChar <= '9') {
+      Value = RefStepChar - '0';
+      return true;
+    }
+  }
+  report_fatal_error("Invalid refinement step for -recip.");
+}
+
+bool TargetRecip::ParseGlobalParams(const std::string &Arg) {
+  bool Enable;
+  bool UseDefaults;
+  if (Arg.find("all") == 0) {
+    UseDefaults = false;
+    Enable = true;
+  } else if (Arg.find("none") == 0) {
+    UseDefaults = false;
+    Enable = false;
+  } else if (Arg.find("default") == 0) {
+    UseDefaults = true;
+  } else {
+    // Any other string is invalid or an individual setting.
+    return false;
+  }
+
+  // All enable values will be initialized to target defaults if 'default' was
+  // specified.
+  if (!UseDefaults)
+    for (int i = 0; i != RO_NUM_RECIP_OPS; ++i)
+      Enabled[i] = Enable;
+
+  size_t RefPos;
+  uint8_t RefSteps;
+  if (ParseRefinementStep(Arg, RefPos, RefSteps)) {
+    // Custom refinement count was specified with all, none, or default.
+    for (int i = 0; i != RO_NUM_RECIP_OPS; ++i)
+      RefinementSteps[i] = RefSteps;
+  }
+  return true;
+}
+
+void TargetRecip::ParseIndividualParams(const std::string &Arg) {
+  std::string ArgSub = Arg;
+
+  // Each reciprocal type may be enabled ('+') or disabled ('-') individually.
+  bool IsEnabled;
+  if (Arg[0] == '+') {
+    ArgSub = Arg.substr(1);
+    IsEnabled = true;
+  } else if (Arg[0] == '-') {
+    ArgSub = Arg.substr(1);
+    IsEnabled = false;
+  } else {
+    // If no plus or minus, default to plus.
+    IsEnabled = true;
+  }
+
+  // Look for an optional setting of the number of refinement steps needed
+  // for this type of reciprocal operation.
+  size_t RefPos;
+  uint8_t RefSteps;
+  std::string RefStepString;
+  if (ParseRefinementStep(ArgSub, RefPos, RefSteps)) {
+    // Split the string for further processing.
+    RefStepString = ArgSub.substr(RefPos + 1);
+    ArgSub = ArgSub.substr(0, RefPos);
+  }
+
+  // Find the reciprocal operation corresponding to this string value.
+  RecipOps Op = RO_NUM_RECIP_OPS;
+  for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+    if (ArgSub == ArgStrings[i]) {
+      Op = (RecipOps)i;
+      break;
+    }
+  }
+
+  if (Op == RO_NUM_RECIP_OPS)
+    report_fatal_error("Invalid option for -recip.");
+
+  // Set whether this operation is being enabled or disabled, and optionally
+  // set the number of refinement steps for this operation.
+  Enabled[Op] = IsEnabled;
+  if (!RefStepString.empty()) {
+    RefinementSteps[Op] = RefSteps;
+  }
+}
+
+TargetRecip::TargetRecip(const std::vector<std::string> &Args) :
+  TargetRecip() {
+  unsigned NumArgs = Args.size();
+
+  // Check if "all", "default", or "none" was specified.
+  if (NumArgs == 1 && ParseGlobalParams(Args[0]))
+    return;
+
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    std::string Value = Args[i];
+    if (Value.empty())
+      report_fatal_error("Empty option string for -recip.");
+    ParseIndividualParams(Value);
+  }
+}
+
+bool TargetRecip::isEnabled(RecipOps Op) const {
+  if (Op == RO_NUM_RECIP_OPS) return false;
+  assert(Enabled[Op] != Uninitialized && "Enabled setting was not initialized");
+  return Enabled[Op];
+}
+
+unsigned TargetRecip::getRefinementSteps(RecipOps Op) const {
+  if (Op == RO_NUM_RECIP_OPS) return 0;
+  assert(RefinementSteps[Op] != Uninitialized &&
+         "Refinement step setting was not initialized");
+  return RefinementSteps[Op];
+}
+
+void TargetRecip::setDefaults(RecipOps Op, bool Enable, unsigned RefSteps) {
+  if (Op == RO_All) {
+    for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+      if (Enabled[i] == Uninitialized)
+        Enabled[i] = Enable;
+      if (RefinementSteps[i] == Uninitialized)
+        RefinementSteps[i] = RefSteps;
+    }
+  } else {
+    if (Enabled[Op] == Uninitialized)
+      Enabled[Op] = Enable;
+    if (RefinementSteps[Op] == Uninitialized)
+      RefinementSteps[Op] = RefSteps;
+  }
+}
+
+bool TargetRecip::operator==(const TargetRecip &Other) const {
+  for (int i = 0; i != RO_NUM_RECIP_OPS; ++i) {
+    if (RefinementSteps[i] != Other.RefinementSteps[i])
+      return false;
+    if (Enabled[i] != Other.Enabled[i])
+      return false;
+  }
+  return true;
+}
+
+TargetRecip::~TargetRecip() {
+}
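
A few worked examples of the grammar the parser above accepts may help review
(hypothetical checks, not committed tests). Note that "all", "none", and
"default" are recognized only when they are the sole value; a mix such as
"all:0,-vec-divf" is rejected by ParseIndividualParams:

    #include "llvm/Target/TargetRecip.h"
    #include <cassert>
    #include <string>
    #include <vector>

    void recipParseExamples() {
      // A lone "none" disables every operation.
      llvm::TargetRecip None(std::vector<std::string>{"none"});
      assert(!None.isEnabled(llvm::RO_SqrtD));

      // "all:0" enables every operation with zero refinement steps.
      llvm::TargetRecip All(std::vector<std::string>{"all:0"});
      assert(All.isEnabled(llvm::RO_VecDivD));
      assert(All.getRefinementSteps(llvm::RO_VecDivD) == 0);

      // A '-' prefix disables one operation; a bare name enables it.
      llvm::TargetRecip Mixed(std::vector<std::string>{"sqrtf", "-vec-sqrtf"});
      assert(Mixed.isEnabled(llvm::RO_SqrtF));
      assert(!Mixed.isEnabled(llvm::RO_VecSqrtF));
    }
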
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -188,10 +188,6 @@
                                      "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                      "INC and DEC instructions are slower than ADD and SUB">;
-def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
-                          "Use RSQRT* to optimize square root calculations">;
-def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
-                          "true", "Use RCP* to optimize division calculations">;
 def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                                      "Use software floating point features.">;
 
@@ -444,7 +440,7 @@
                                        FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                                        FeatureBMI, FeatureF16C, FeatureMOVBE,
                                        FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                                       FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                                       FeatureSlowSHLD]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@
                  "rather than promotion."),
         cl::Hidden);
 
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
-    "x86-recip-refinement-steps", cl::init(1),
-    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
-             "result of the hardware reciprocal estimate instruction."),
-    cl::NotHidden);
-
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
@@ -12874,29 +12868,31 @@
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
+  RecipOps RecipOp;
 
-  // SSE1 has rsqrtss and rsqrtps.
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = 1;
-    UseOneConstNR = false;
-    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = RO_SqrtF;
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = RO_VecSqrtF;
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
 }
 
 /// The minimum architected relative accuracy is 2^-12. We need one
@@ -12904,15 +12900,9 @@
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
-
+  RecipOps RecipOp;
+
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // reciprocal estimate with refinement on x86 prior to FMA requires
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = ReciprocalEstimateRefinementSteps;
-    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = RO_DivF;
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = RO_VecDivF;
+  else
+    return SDValue();
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
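
For reviewers unfamiliar with the refinement counts being plumbed through here:
each Newton-Raphson step for a reciprocal computes x' = x * (2 - d * x), which
roughly doubles the number of correct bits, so the ~12-bit rcpss estimate needs
one step to approach full f32 accuracy. A plain scalar model for illustration
only (the real sequence is emitted as SDNodes by the DAG combiner, not by this
function):

    // Refine an initial hardware estimate of 1/d, e.g. the result of rcpss.
    float refineRecip(float d, float Est, unsigned RefinementSteps) {
      float X = Est;
      for (unsigned i = 0; i != RefinementSteps; ++i)
        X = X * (2.0f - d * X); // one Newton-Raphson iteration
      return X;
    }
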
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,6 @@
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
-  /// Use the RSQRT* instructions to optimize square root calculations.
-  /// For this to be profitable, the cost of FSQRT and FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseSqrtEst;
-
-  /// Use the RCP* instructions to optimize FP division calculations.
-  /// For this to be profitable, the cost of FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseReciprocalEst;
-
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -377,8 +367,6 @@
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
-  bool useSqrtEst() const { return UseSqrtEst; }
-  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -273,8 +273,6 @@
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
-  UseSqrtEst = false;
-  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -109,6 +109,7 @@
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
+  this->Options.Reciprocals.setDefaults(RO_All, false, 1);
   initAsmInfo();
 }
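
The X86 default above (everything off, one refinement step) replaces the
removed bdver2 subtarget features, so AMD Piledriver loses its implicit opt-in
and users must now pass -recip explicitly. A target that wants estimates on by
default could set that up in its TargetMachine constructor; a hypothetical
sketch (not part of this patch), relying on the rule that explicit -recip
values are filled in first and therefore win:

    #include "llvm/Target/TargetOptions.h"

    void configureRecipDefaults(llvm::TargetOptions &Options) {
      // Specific ops must be set before RO_All, because setDefaults() never
      // overwrites a slot that is already initialized (e.g. by -recip).
      Options.Reciprocals.setDefaults(llvm::RO_SqrtF, true, 1);
      Options.Reciprocals.setDefaults(llvm::RO_VecSqrtF, true, 1);
      Options.Reciprocals.setDefaults(llvm::RO_All, false, 1);
    }
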
Index: test/CodeGen/X86/recip-fastmath.ll
===================================================================
--- test/CodeGen/X86/recip-fastmath.ll
+++ test/CodeGen/X86/recip-fastmath.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
 
 ; If the target's divss/divps instructions are substantially
 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
Index: test/CodeGen/X86/sqrt-fastmath.ll
===================================================================
--- test/CodeGen/X86/sqrt-fastmath.ll
+++ test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
 
 declare double @__sqrt_finite(double) #0
 declare float @__sqrtf_finite(float) #0