Index: llvm/trunk/include/llvm/CodeGen/CommandFlags.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/CommandFlags.h +++ llvm/trunk/include/llvm/CodeGen/CommandFlags.h @@ -24,6 +24,7 @@ #include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRecip.h" #include using namespace llvm; @@ -152,6 +153,12 @@ "Only fuse FP ops when the result won't be effected."), clEnumValEnd)); +cl::list +ReciprocalOps("recip", + cl::CommaSeparated, + cl::desc("Choose reciprocal operation types and parameters."), + cl::value_desc("all,none,default,divf,!vec-sqrtd,vec-divd:0,sqrt:9...")); + cl::opt DontPlaceZerosInBSS("nozero-initialized-in-bss", cl::desc("Don't place zero-initialized symbols into bss section"), @@ -231,6 +238,7 @@ Options.LessPreciseFPMADOption = EnableFPMAD; Options.NoFramePointerElim = DisableFPElim; Options.AllowFPOpFusion = FuseFPOps; + Options.Reciprocals = ReciprocalOps; Options.UnsafeFPMath = EnableUnsafeFPMath; Options.NoInfsFPMath = EnableNoInfsFPMath; Options.NoNaNsFPMath = EnableNoNaNsFPMath; Index: llvm/trunk/include/llvm/Target/TargetOptions.h =================================================================== --- llvm/trunk/include/llvm/Target/TargetOptions.h +++ llvm/trunk/include/llvm/Target/TargetOptions.h @@ -15,6 +15,7 @@ #ifndef LLVM_TARGET_TARGETOPTIONS_H #define LLVM_TARGET_TARGETOPTIONS_H +#include "llvm/Target/TargetRecip.h" #include "llvm/MC/MCTargetOptions.h" #include @@ -71,7 +72,8 @@ CompressDebugSections(false), FunctionSections(false), DataSections(false), UniqueSectionNames(true), TrapUnreachable(false), TrapFuncName(), FloatABIType(FloatABI::Default), - AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single), + AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(), + JTType(JumpTable::Single), ThreadModel(ThreadModel::POSIX) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs @@ -210,6 +212,9 @@ /// the value of this option. FPOpFusion::FPOpFusionMode AllowFPOpFusion; + /// This class encapsulates options for reciprocal-estimate code generation. + TargetRecip Reciprocals; + /// JTType - This flag specifies the type of jump-instruction table to /// create for functions that have the jumptable attribute. JumpTable::JumpTableType JTType; @@ -244,6 +249,7 @@ ARE_EQUAL(TrapFuncName) && ARE_EQUAL(FloatABIType) && ARE_EQUAL(AllowFPOpFusion) && + ARE_EQUAL(Reciprocals) && ARE_EQUAL(JTType) && ARE_EQUAL(ThreadModel) && ARE_EQUAL(MCOptions); Index: llvm/trunk/include/llvm/Target/TargetRecip.h =================================================================== --- llvm/trunk/include/llvm/Target/TargetRecip.h +++ llvm/trunk/include/llvm/Target/TargetRecip.h @@ -0,0 +1,72 @@ +//===--------------------- llvm/Target/TargetRecip.h ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class is used to customize machine-specific reciprocal estimate code +// generation in a target-independent way. +// If a target does not support operations in this specification, then code +// generation will default to using supported operations. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_TARGETRECIP_H +#define LLVM_TARGET_TARGETRECIP_H + +#include +#include +#include + +namespace llvm { + +struct TargetRecip { +public: + TargetRecip(); + + /// Initialize all or part of the operations from command-line options or + /// a front end. + TargetRecip(const std::vector &Args); + + /// Set whether a particular reciprocal operation is enabled and how many + /// refinement steps are needed when using it. Use "all" to set enablement + /// and refinement steps for all operations. + void setDefaults(const StringRef &Key, bool Enable, unsigned RefSteps); + + /// Return true if the reciprocal operation has been enabled by default or + /// from the command-line. Return false if the operation has been disabled + /// by default or from the command-line. + bool isEnabled(const StringRef &Key) const; + + /// Return the number of iterations necessary to refine the + /// the result of a machine instruction for the given reciprocal operation. + unsigned getRefinementSteps(const StringRef &Key) const; + + bool operator==(const TargetRecip &Other) const; + +private: + enum { + Uninitialized = -1 + }; + + struct RecipParams { + int8_t Enabled; + int8_t RefinementSteps; + + RecipParams() : Enabled(Uninitialized), RefinementSteps(Uninitialized) {} + }; + + std::map RecipMap; + typedef std::map::iterator RecipIter; + typedef std::map::const_iterator ConstRecipIter; + + bool parseGlobalParams(const std::string &Arg); + void parseIndividualParams(const std::vector &Args); +}; + +} // End llvm namespace + +#endif Index: llvm/trunk/lib/Target/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/CMakeLists.txt +++ llvm/trunk/lib/Target/CMakeLists.txt @@ -6,6 +6,7 @@ TargetLoweringObjectFile.cpp TargetMachine.cpp TargetMachineC.cpp + TargetRecip.cpp TargetSubtargetInfo.cpp ADDITIONAL_HEADER_DIRS Index: llvm/trunk/lib/Target/TargetRecip.cpp =================================================================== --- llvm/trunk/lib/Target/TargetRecip.cpp +++ llvm/trunk/lib/Target/TargetRecip.cpp @@ -0,0 +1,225 @@ +//===-------------------------- TargetRecip.cpp ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class is used to customize machine-specific reciprocal estimate code +// generation in a target-independent way. +// If a target does not support operations in this specification, then code +// generation will default to using supported operations. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetRecip.h" +#include + +using namespace llvm; + +// These are the names of the individual reciprocal operations. These are +// the key strings for queries and command-line inputs. +// In addition, the command-line interface recognizes the global parameters +// "all", "none", and "default". +static const char *RecipOps[] = { + "divd", + "divf", + "vec-divd", + "vec-divf", + "sqrtd", + "sqrtf", + "vec-sqrtd", + "vec-sqrtf", +}; + +// The uninitialized state is needed for the enabled settings and refinement +// steps because custom settings may arrive via the command-line before target +// defaults are set. +TargetRecip::TargetRecip() { + unsigned NumStrings = llvm::array_lengthof(RecipOps); + for (unsigned i = 0; i < NumStrings; ++i) + RecipMap.insert(std::make_pair(RecipOps[i], RecipParams())); +} + +static bool parseRefinementStep(const StringRef &In, size_t &Position, + uint8_t &Value) { + const char RefStepToken = ':'; + Position = In.find(RefStepToken); + if (Position == StringRef::npos) + return false; + + StringRef RefStepString = In.substr(Position + 1); + // Allow exactly one numeric character for the additional refinement + // step parameter. + if (RefStepString.size() == 1) { + char RefStepChar = RefStepString[0]; + if (RefStepChar >= '0' && RefStepChar <= '9') { + Value = RefStepChar - '0'; + return true; + } + } + report_fatal_error("Invalid refinement step for -recip."); +} + +bool TargetRecip::parseGlobalParams(const std::string &Arg) { + StringRef ArgSub = Arg; + + // Look for an optional setting of the number of refinement steps needed + // for this type of reciprocal operation. + size_t RefPos; + uint8_t RefSteps; + StringRef RefStepString; + if (parseRefinementStep(ArgSub, RefPos, RefSteps)) { + // Split the string for further processing. + RefStepString = ArgSub.substr(RefPos + 1); + ArgSub = ArgSub.substr(0, RefPos); + } + bool Enable; + bool UseDefaults; + if (ArgSub == "all") { + UseDefaults = false; + Enable = true; + } else if (ArgSub == "none") { + UseDefaults = false; + Enable = false; + } else if (ArgSub == "default") { + UseDefaults = true; + } else { + // Any other string is invalid or an individual setting. + return false; + } + + // All enable values will be initialized to target defaults if 'default' was + // specified. + if (!UseDefaults) + for (auto &KV : RecipMap) + KV.second.Enabled = Enable; + + // Custom refinement count was specified with all, none, or default. + if (!RefStepString.empty()) + for (auto &KV : RecipMap) + KV.second.RefinementSteps = RefSteps; + + return true; +} + +void TargetRecip::parseIndividualParams(const std::vector &Args) { + static const char DisabledPrefix = '!'; + unsigned NumArgs = Args.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + StringRef Val = Args[i]; + + bool IsDisabled = Val[0] == DisabledPrefix; + // Ignore the disablement token for string matching. + if (IsDisabled) + Val = Val.substr(1); + + size_t RefPos; + uint8_t RefSteps; + StringRef RefStepString; + if (parseRefinementStep(Val, RefPos, RefSteps)) { + // Split the string for further processing. + RefStepString = Val.substr(RefPos + 1); + Val = Val.substr(0, RefPos); + } + + RecipIter Iter = RecipMap.find(Val); + if (Iter == RecipMap.end()) { + // Try again specifying float suffix. + Iter = RecipMap.find(Val.str() + 'f'); + if (Iter == RecipMap.end()) { + Iter = RecipMap.find(Val.str() + 'd'); + assert(Iter == RecipMap.end() && "Float entry missing from map"); + report_fatal_error("Invalid option for -recip."); + } + + // The option was specified without a float or double suffix. + if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) { + // Make sure that the double entry was not already specified. + // The float entry will be checked below. + report_fatal_error("Duplicate option for -recip."); + } + } + + if (Iter->second.Enabled != Uninitialized) + report_fatal_error("Duplicate option for -recip."); + + // Mark the matched option as found. Do not allow duplicate specifiers. + Iter->second.Enabled = !IsDisabled; + if (!RefStepString.empty()) + Iter->second.RefinementSteps = RefSteps; + + // If the precision was not specified, the double entry is also initialized. + if (Val.back() != 'f' && Val.back() != 'd') { + RecipMap[Val.str() + 'd'].Enabled = !IsDisabled; + if (!RefStepString.empty()) + RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps; + } + } +} + +TargetRecip::TargetRecip(const std::vector &Args) : + TargetRecip() { + unsigned NumArgs = Args.size(); + + // Check if "all", "default", or "none" was specified. + if (NumArgs == 1 && parseGlobalParams(Args[0])) + return; + + parseIndividualParams(Args); +} + +bool TargetRecip::isEnabled(const StringRef &Key) const { + ConstRecipIter Iter = RecipMap.find(Key); + assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); + assert(Iter->second.Enabled != Uninitialized && + "Enablement setting was not initialized"); + return Iter->second.Enabled; +} + +unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { + ConstRecipIter Iter = RecipMap.find(Key); + assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); + assert(Iter->second.RefinementSteps != Uninitialized && + "Refinement step setting was not initialized"); + return Iter->second.RefinementSteps; +} + +/// Custom settings (previously initialized values) override target defaults. +void TargetRecip::setDefaults(const StringRef &Key, bool Enable, + unsigned RefSteps) { + if (Key == "all") { + for (auto &KV : RecipMap) { + RecipParams &RP = KV.second; + if (RP.Enabled == Uninitialized) + RP.Enabled = Enable; + if (RP.RefinementSteps == Uninitialized) + RP.RefinementSteps = RefSteps; + } + } else { + RecipParams &RP = RecipMap[Key]; + if (RP.Enabled == Uninitialized) + RP.Enabled = Enable; + if (RP.RefinementSteps == Uninitialized) + RP.RefinementSteps = RefSteps; + } +} + +bool TargetRecip::operator==(const TargetRecip &Other) const { + for (const auto &KV : RecipMap) { + const StringRef &Op = KV.first; + const RecipParams &RP = KV.second; + const RecipParams &OtherRP = Other.RecipMap.find(Op)->second; + if (RP.RefinementSteps != OtherRP.RefinementSteps) + return false; + if (RP.Enabled != OtherRP.Enabled) + return false; + } + return true; +} Index: llvm/trunk/lib/Target/X86/X86.td =================================================================== --- llvm/trunk/lib/Target/X86/X86.td +++ llvm/trunk/lib/Target/X86/X86.td @@ -188,10 +188,6 @@ "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; -def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", - "Use RSQRT* to optimize square root calculations">; -def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst", - "true", "Use RCP* to optimize division calculations">; def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software floating point features.">; @@ -444,7 +440,7 @@ FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, - FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>; + FeatureSlowSHLD]>; // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -67,12 +67,6 @@ "rather than promotion."), cl::Hidden); -static cl::opt ReciprocalEstimateRefinementSteps( - "x86-recip-refinement-steps", cl::init(1), - cl::desc("Specify the number of Newton-Raphson iterations applied to the " - "result of the hardware reciprocal estimate instruction."), - cl::NotHidden); - // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -12901,29 +12895,31 @@ DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const { - // FIXME: We should use instruction latency models to calculate the cost of - // each potential sequence, but this is very hard to do reliably because - // at least Intel's Core* chips have variable timing based on the number of - // significant digits in the divisor and/or sqrt operand. - if (!Subtarget->useSqrtEst()) - return SDValue(); - EVT VT = Op.getValueType(); + const char *RecipOp; - // SSE1 has rsqrtss and rsqrtps. + // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. - if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget->hasAVX() && VT == MVT::v8f32)) { - RefinementSteps = 1; - UseOneConstNR = false; - return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); - } - return SDValue(); + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "sqrtf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-sqrtf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); } /// The minimum architected relative accuracy is 2^-12. We need one @@ -12931,15 +12927,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const { - // FIXME: We should use instruction latency models to calculate the cost of - // each potential sequence, but this is very hard to do reliably because - // at least Intel's Core* chips have variable timing based on the number of - // significant digits in the divisor. - if (!Subtarget->useReciprocalEst()) - return SDValue(); - EVT VT = Op.getValueType(); - + const char *RecipOp; + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision @@ -12947,12 +12937,20 @@ // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. - if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || - (Subtarget->hasAVX() && VT == MVT::v8f32)) { - RefinementSteps = ReciprocalEstimateRefinementSteps; - return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); - } - return SDValue(); + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "divf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-divf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } /// If we have at least two divisions that use the same divisor, convert to Index: llvm/trunk/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.h +++ llvm/trunk/lib/Target/X86/X86Subtarget.h @@ -190,16 +190,6 @@ /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; - /// Use the RSQRT* instructions to optimize square root calculations. - /// For this to be profitable, the cost of FSQRT and FDIV must be - /// substantially higher than normal FP ops like FADD and FMUL. - bool UseSqrtEst; - - /// Use the RCP* instructions to optimize FP division calculations. - /// For this to be profitable, the cost of FDIV must be - /// substantially higher than normal FP ops like FADD and FMUL. - bool UseReciprocalEst; - /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -377,8 +367,6 @@ bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } - bool useSqrtEst() const { return UseSqrtEst; } - bool useReciprocalEst() const { return UseReciprocalEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp @@ -273,8 +273,6 @@ LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; - UseSqrtEst = false; - UseReciprocalEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; Index: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp @@ -105,6 +105,13 @@ if (Subtarget.isTargetWin64()) this->Options.TrapUnreachable = true; + // TODO: By default, all reciprocal estimate operations are off because + // that matches the behavior before TargetRecip was added (except for btver2 + // which used subtarget features to enable this type of codegen). + // We should change this to match GCC behavior where everything but + // scalar division estimates are turned on by default with -ffast-math. + this->Options.Reciprocals.setDefaults("all", false, 1); + initAsmInfo(); } Index: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll +++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE ; If the target's divss/divps instructions are substantially ; slower than rcpss/rcpps with a Newton-Raphson refinement, Index: llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll +++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE declare double @__sqrt_finite(double) #0 declare float @__sqrtf_finite(float) #0