diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -351,6 +351,11 @@
     : SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
                        "Various instructions can be fused with conditional branches">;
 
+// Bulldozer and newer processors can merge CMP/TEST with conditional branches.
+def FeatureBranchFusion
+    : SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
+                       "CMP/TEST can be fused with conditional branches">;
+
 // Gather is available since Haswell (AVX2 set). So technically, we can
 // generate Gathers on all AVX2 processors. But the overhead on HSW is high.
 // Skylake Client processor has faster Gathers than HSW and performance is
@@ -810,7 +815,7 @@
                                        FeatureSlowSHLD,
                                        FeatureLAHFSAHF,
                                        FeatureFast11ByteNOP,
-                                       FeatureMacroFusion];
+                                       FeatureBranchFusion];
   list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
 
   // PileDriver
@@ -860,7 +865,7 @@
                                        FeatureLZCNT,
                                        FeatureFastBEXTR,
                                        FeatureFast15ByteNOP,
-                                       FeatureMacroFusion,
+                                       FeatureBranchFusion,
                                        FeatureMMX,
                                        FeatureMOVBE,
                                        FeatureMWAITX,
diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp
--- a/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -18,59 +18,23 @@
 
 using namespace llvm;
 
-/// Check if the instr pair, FirstMI and SecondMI, should be fused
-/// together. Given SecondMI, when FirstMI is unspecified, then check if
-/// SecondMI may be part of a fused pair at all.
-static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
-                                   const TargetSubtargetInfo &TSI,
-                                   const MachineInstr *FirstMI,
-                                   const MachineInstr &SecondMI) {
-  const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
-  // Check if this processor supports macro-fusion.
-  if (!ST.hasMacroFusion())
-    return false;
+namespace {
 
-  enum {
-    FuseTest,
-    FuseCmp,
-    FuseInc
-  } FuseKind;
+enum class FirstKind { Test, Cmp, And, ALU, IncDec, Invalid };
 
-  unsigned FirstOpcode = FirstMI
-                         ? FirstMI->getOpcode()
-                         : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
-  unsigned SecondOpcode = SecondMI.getOpcode();
+enum class SecondKind {
+  ELG,
+  AB,
+  SPO,
+  Invalid,
+};
 
-  switch (SecondOpcode) {
-  default:
-    return false;
-  case X86::JE_1:
-  case X86::JNE_1:
-  case X86::JL_1:
-  case X86::JLE_1:
-  case X86::JG_1:
-  case X86::JGE_1:
-    FuseKind = FuseInc;
-    break;
-  case X86::JB_1:
-  case X86::JBE_1:
-  case X86::JA_1:
-  case X86::JAE_1:
-    FuseKind = FuseCmp;
-    break;
-  case X86::JS_1:
-  case X86::JNS_1:
-  case X86::JP_1:
-  case X86::JNP_1:
-  case X86::JO_1:
-  case X86::JNO_1:
-    FuseKind = FuseTest;
-    break;
-  }
+} // namespace
 
-  switch (FirstOpcode) {
+static FirstKind classifyFirst(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
   default:
-    return false;
+    return FirstKind::Invalid;
   case X86::TEST8rr:
   case X86::TEST16rr:
   case X86::TEST32rr:
@@ -83,6 +47,7 @@
   case X86::TEST16mr:
   case X86::TEST32mr:
   case X86::TEST64mr:
+    return FirstKind::Test;
   case X86::AND16ri:
   case X86::AND16ri8:
   case X86::AND16rm:
@@ -98,7 +63,7 @@
   case X86::AND8ri:
   case X86::AND8rm:
   case X86::AND8rr:
-    return true;
+    return FirstKind::And;
   case X86::CMP16ri:
   case X86::CMP16ri8:
   case X86::CMP16rm:
@@ -118,6 +83,7 @@
   case X86::CMP8rm:
   case X86::CMP8rr:
   case X86::CMP8mr:
+    return FirstKind::Cmp;
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri8_DB:
@@ -159,7 +125,7 @@
   case X86::SUB8ri:
   case X86::SUB8rm:
   case X86::SUB8rr:
-    return FuseKind == FuseCmp || FuseKind == FuseInc;
+    return FirstKind::ALU;
   case X86::INC16r:
   case X86::INC32r:
   case X86::INC64r:
@@ -168,10 +134,82 @@
   case X86::DEC32r:
   case X86::DEC64r:
   case X86::DEC8r:
-    return FuseKind == FuseInc;
-  case X86::INSTRUCTION_LIST_END:
-    return true;
+    return FirstKind::IncDec;
+  }
+}
+
+static SecondKind classifySecond(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return SecondKind::Invalid;
+  case X86::JE_1:
+  case X86::JNE_1:
+  case X86::JL_1:
+  case X86::JLE_1:
+  case X86::JG_1:
+  case X86::JGE_1:
+    return SecondKind::ELG;
+  case X86::JB_1:
+  case X86::JBE_1:
+  case X86::JA_1:
+  case X86::JAE_1:
+    return SecondKind::AB;
+  case X86::JS_1:
+  case X86::JNS_1:
+  case X86::JP_1:
+  case X86::JNP_1:
+  case X86::JO_1:
+  case X86::JNO_1:
+    return SecondKind::SPO;
+  }
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+                                   const TargetSubtargetInfo &TSI,
+                                   const MachineInstr *FirstMI,
+                                   const MachineInstr &SecondMI) {
+  const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
+
+  // Check if this processor supports any kind of fusion.
+  if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
+    return false;
+
+  const SecondKind BranchKind = classifySecond(SecondMI);
+
+  if (BranchKind == SecondKind::Invalid)
+    return false; // Second cannot be fused with anything.
+
+  if (FirstMI == nullptr)
+    return true; // We're only checking whether Second can be fused at all.
+
+  const FirstKind TestKind = classifyFirst(*FirstMI);
+
+  if (ST.hasBranchFusion()) {
+    // Branch fusion can merge CMP and TEST with all conditional jumps.
+    return (TestKind == FirstKind::Cmp || TestKind == FirstKind::Test);
+  }
+
+  if (ST.hasMacroFusion()) {
+    // Macro Fusion rules are a bit more complex. See Agner Fog's
+    // Microarchitecture table 9.2 "Instruction Fusion".
+    switch (TestKind) {
+    case FirstKind::Test:
+    case FirstKind::And:
+      return true;
+    case FirstKind::Cmp:
+    case FirstKind::ALU:
+      return BranchKind == SecondKind::ELG || BranchKind == SecondKind::AB;
+    case FirstKind::IncDec:
+      return BranchKind == SecondKind::ELG;
+    case FirstKind::Invalid:
+      return false;
+    }
   }
+
+  llvm_unreachable("");
 }
 
 namespace llvm {
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -297,6 +297,9 @@
   /// True if the processor supports macrofusion.
   bool HasMacroFusion = false;
 
+  /// True if the processor supports branch fusion.
+  bool HasBranchFusion = false;
+
   /// True if the processor has enhanced REP MOVSB/STOSB.
   bool HasERMSB = false;
 
@@ -642,6 +645,7 @@
   bool hasFastBEXTR() const { return HasFastBEXTR; }
   bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasMacroFusion() const { return HasMacroFusion; }
+  bool hasBranchFusion() const { return HasBranchFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -36,57 +36,59 @@
   const X86TargetLowering *getTLI() const { return TLI; }
 
   const FeatureBitset InlineFeatureIgnoreList = {
-    // This indicates the CPU is 64 bit capable not that we are in 64-bit mode.
-    X86::Feature64Bit,
-
-    // These features don't have any intrinsics or ABI effect.
-    X86::FeatureNOPL,
-    X86::FeatureCMPXCHG16B,
-    X86::FeatureLAHFSAHF,
-
-    // Codegen control options.
-    X86::FeatureFast11ByteNOP,
-    X86::FeatureFast15ByteNOP,
-    X86::FeatureFastBEXTR,
-    X86::FeatureFastHorizontalOps,
-    X86::FeatureFastLZCNT,
-    X86::FeatureFastPartialYMMorZMMWrite,
-    X86::FeatureFastScalarFSQRT,
-    X86::FeatureFastSHLDRotate,
-    X86::FeatureFastVariableShuffle,
-    X86::FeatureFastVectorFSQRT,
-    X86::FeatureLEAForSP,
-    X86::FeatureLEAUsesAG,
-    X86::FeatureLZCNTFalseDeps,
-    X86::FeatureMacroFusion,
-    X86::FeatureMergeToThreeWayBranch,
-    X86::FeaturePadShortFunctions,
-    X86::FeaturePOPCNTFalseDeps,
-    X86::FeatureSSEUnalignedMem,
-    X86::FeatureSlow3OpsLEA,
-    X86::FeatureSlowDivide32,
-    X86::FeatureSlowDivide64,
-    X86::FeatureSlowIncDec,
-    X86::FeatureSlowLEA,
-    X86::FeatureSlowPMADDWD,
-    X86::FeatureSlowPMULLD,
-    X86::FeatureSlowSHLD,
-    X86::FeatureSlowTwoMemOps,
-    X86::FeatureSlowUAMem16,
-
-    // Perf-tuning flags.
-    X86::FeatureHasFastGather,
-    X86::FeatureSlowUAMem32,
-
-    // Based on whether user set the -mprefer-vector-width command line.
-    X86::FeaturePrefer256Bit,
-
-    // CPU name enums. These just follow CPU string.
-    X86::ProcIntelAtom,
-    X86::ProcIntelGLM,
-    X86::ProcIntelGLP,
-    X86::ProcIntelSLM,
-    X86::ProcIntelTRM,
+      // This indicates the CPU is 64 bit capable not that we are in 64-bit
+      // mode.
+      X86::Feature64Bit,
+
+      // These features don't have any intrinsics or ABI effect.
+      X86::FeatureNOPL,
+      X86::FeatureCMPXCHG16B,
+      X86::FeatureLAHFSAHF,
+
+      // Codegen control options.
+      X86::FeatureFast11ByteNOP,
+      X86::FeatureFast15ByteNOP,
+      X86::FeatureFastBEXTR,
+      X86::FeatureFastHorizontalOps,
+      X86::FeatureFastLZCNT,
+      X86::FeatureFastPartialYMMorZMMWrite,
+      X86::FeatureFastScalarFSQRT,
+      X86::FeatureFastSHLDRotate,
+      X86::FeatureFastVariableShuffle,
+      X86::FeatureFastVectorFSQRT,
+      X86::FeatureLEAForSP,
+      X86::FeatureLEAUsesAG,
+      X86::FeatureLZCNTFalseDeps,
+      X86::FeatureMacroFusion,
+      X86::FeatureBranchFusion,
+      X86::FeatureMergeToThreeWayBranch,
+      X86::FeaturePadShortFunctions,
+      X86::FeaturePOPCNTFalseDeps,
+      X86::FeatureSSEUnalignedMem,
+      X86::FeatureSlow3OpsLEA,
+      X86::FeatureSlowDivide32,
+      X86::FeatureSlowDivide64,
+      X86::FeatureSlowIncDec,
+      X86::FeatureSlowLEA,
+      X86::FeatureSlowPMADDWD,
+      X86::FeatureSlowPMULLD,
+      X86::FeatureSlowSHLD,
+      X86::FeatureSlowTwoMemOps,
+      X86::FeatureSlowUAMem16,
+
+      // Perf-tuning flags.
+      X86::FeatureHasFastGather,
+      X86::FeatureSlowUAMem32,
+
+      // Based on whether user set the -mprefer-vector-width command line.
+      X86::FeaturePrefer256Bit,
+
+      // CPU name enums. These just follow CPU string.
+      X86::ProcIntelAtom,
+      X86::ProcIntelGLM,
+      X86::ProcIntelGLP,
+      X86::ProcIntelSLM,
+      X86::ProcIntelTRM,
   };
 
 public:
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2984,7 +2984,7 @@
 }
 
 bool X86TTIImpl::canMacroFuseCmp() {
-  return ST->hasMacroFusion();
+  return ST->hasMacroFusion() || ST->hasBranchFusion();
 }
 
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
@@ -2,8 +2,9 @@
 ; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG
 ; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW
 
-; RUN: llc < %s                    | FileCheck %s --check-prefix=BASE
-; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
+; RUN: llc < %s                     | FileCheck %s --check-prefix=BASE
+; RUN: llc < %s -mattr=macrofusion  | FileCheck %s --check-prefix=FUSE
+; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
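
Note (illustration only, not part of the patch): the sketch below is a hypothetical reduced loop in the spirit of what macro-fuse-cmp.ll exercises; the function name and exact IR are made up. The intent is that with -mattr=branchfusion, as with -mattr=macrofusion, canMacroFuseCmp() should let LSR keep the loop-terminating compare next to the conditional branch so the backend can emit an adjacent CMP/JCC (or TEST/JCC) pair.

; Hypothetical reduced input, e.g. run as:  llc -mattr=branchfusion reduced.ll
; (assumes %n >= 1; the compare feeding the back-edge branch is the
; CMP/JCC fusion candidate after instruction selection)
define i64 @sum(i64* %a, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi i64 [ 0, %entry ], [ %acc.next, %loop ]
  %p = getelementptr inbounds i64, i64* %a, i64 %i
  %v = load i64, i64* %p
  %acc.next = add i64 %acc, %v
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret i64 %acc.next
}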