diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -351,6 +351,11 @@ : SubtargetFeature<"macrofusion", "HasMacroFusion", "true", "Various instructions can be fused with conditional branches">; +// Bulldozer and newer processors can merge CMP/TEST with conditional branches. +def FeatureBranchFusion + : SubtargetFeature<"branchfusion", "HasBranchFusion", "true", + "CMP/TEST can be fused with conditional branches">; + // Gather is available since Haswell (AVX2 set). So technically, we can // generate Gathers on all AVX2 processors. But the overhead on HSW is high. // Skylake Client processor has faster Gathers than HSW and performance is @@ -810,7 +815,7 @@ FeatureSlowSHLD, FeatureLAHFSAHF, FeatureFast11ByteNOP, - FeatureMacroFusion]; + FeatureBranchFusion]; list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures; // PileDriver @@ -860,7 +865,7 @@ FeatureLZCNT, FeatureFastBEXTR, FeatureFast15ByteNOP, - FeatureMacroFusion, + FeatureBranchFusion, FeatureMMX, FeatureMOVBE, FeatureMWAITX, diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -18,59 +18,30 @@ using namespace llvm; -/// Check if the instr pair, FirstMI and SecondMI, should be fused -/// together. Given SecondMI, when FirstMI is unspecified, then check if -/// SecondMI may be part of a fused pair at all. -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI); - // Check if this processor supports macro-fusion. - if (!ST.hasMacroFusion()) - return false; +namespace { - enum { - FuseTest, - FuseCmp, - FuseInc - } FuseKind; +enum class FirstKind { + Test, + Cmp, + And, + ALU, + IncDec, + Invalid +}; - unsigned FirstOpcode = FirstMI - ? 
FirstMI->getOpcode() - : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = SecondMI.getOpcode(); +enum class SecondKind { + ELG, + AB, + SPO, + Invalid, +}; - switch (SecondOpcode) { - default: - return false; - case X86::JE_1: - case X86::JNE_1: - case X86::JL_1: - case X86::JLE_1: - case X86::JG_1: - case X86::JGE_1: - FuseKind = FuseInc; - break; - case X86::JB_1: - case X86::JBE_1: - case X86::JA_1: - case X86::JAE_1: - FuseKind = FuseCmp; - break; - case X86::JS_1: - case X86::JNS_1: - case X86::JP_1: - case X86::JNP_1: - case X86::JO_1: - case X86::JNO_1: - FuseKind = FuseTest; - break; - } +} // namespace - switch (FirstOpcode) { +static FirstKind classifyFirst(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: - return false; + return FirstKind::Invalid; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: @@ -83,6 +54,7 @@ case X86::TEST16mr: case X86::TEST32mr: case X86::TEST64mr: + return FirstKind::Test; case X86::AND16ri: case X86::AND16ri8: case X86::AND16rm: @@ -98,7 +70,7 @@ case X86::AND8ri: case X86::AND8rm: case X86::AND8rr: - return true; + return FirstKind::And; case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP16rm: @@ -118,6 +90,7 @@ case X86::CMP8rm: case X86::CMP8rr: case X86::CMP8mr: + return FirstKind::Cmp; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri8_DB: @@ -159,7 +132,7 @@ case X86::SUB8ri: case X86::SUB8rm: case X86::SUB8rr: - return FuseKind == FuseCmp || FuseKind == FuseInc; + return FirstKind::ALU; case X86::INC16r: case X86::INC32r: case X86::INC64r: @@ -168,12 +141,84 @@ case X86::DEC32r: case X86::DEC64r: case X86::DEC8r: - return FuseKind == FuseInc; - case X86::INSTRUCTION_LIST_END: - return true; + return FirstKind::IncDec; + } +} + +static SecondKind classifySecond(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return SecondKind::Invalid; + case X86::JE_1: + case X86::JNE_1: + case X86::JL_1: + case X86::JLE_1: + case X86::JG_1: + case X86::JGE_1: + return 
SecondKind::ELG; + case X86::JB_1: + case X86::JBE_1: + case X86::JA_1: + case X86::JAE_1: + return SecondKind::AB; + case X86::JS_1: + case X86::JNS_1: + case X86::JP_1: + case X86::JNP_1: + case X86::JO_1: + case X86::JNO_1: + return SecondKind::SPO; } } +/// Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI); + + // Check if this processor supports any kind of fusion. + if (!(ST.hasBranchFusion() || ST.hasMacroFusion())) + return false; + + const SecondKind BranchKind = classifySecond(SecondMI); + + if (BranchKind == SecondKind::Invalid) + return false; // Second cannot be fused with anything. + + if (FirstMI == nullptr) + return true; // We're only checking whether Second can be fused at all. + + const FirstKind TestKind = classifyFirst(*FirstMI); + + if (ST.hasBranchFusion()) { + // Branch fusion can merge CMP and TEST with all conditional jumps. + return (TestKind == FirstKind::Cmp || TestKind == FirstKind::Test); + } + + if (ST.hasMacroFusion()) { + // Macro Fusion rules are a bit more complex. See Agner Fog's + // Microarchitecture table 9.2 "Instruction Fusion". 
+ switch (TestKind) { + case FirstKind::Test: + case FirstKind::And: + return true; + case FirstKind::Cmp: + case FirstKind::ALU: + return BranchKind == SecondKind::ELG || BranchKind == SecondKind::AB; + case FirstKind::IncDec: + return BranchKind == SecondKind::ELG; + case FirstKind::Invalid: + return false; + } + } + + llvm_unreachable(""); +} + namespace llvm { std::unique_ptr diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -297,6 +297,9 @@ /// True if the processor supports macrofusion. bool HasMacroFusion = false; + /// True if the processor supports branch fusion. + bool HasBranchFusion = false; + /// True if the processor has enhanced REP MOVSB/STOSB. bool HasERMSB = false; @@ -642,6 +645,7 @@ bool hasFastBEXTR() const { return HasFastBEXTR; } bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } bool hasMacroFusion() const { return HasMacroFusion; } + bool hasBranchFusion() const { return HasBranchFusion; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -59,6 +59,7 @@ X86::FeatureLEAUsesAG, X86::FeatureLZCNTFalseDeps, X86::FeatureMacroFusion, + X86::FeatureBranchFusion, X86::FeatureMergeToThreeWayBranch, X86::FeaturePadShortFunctions, X86::FeaturePOPCNTFalseDeps, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2984,7 +2984,7 @@ } bool X86TTIImpl::canMacroFuseCmp() { - return ST->hasMacroFusion(); + return ST->hasMacroFusion() || 
ST->hasBranchFusion(); } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll --- a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll @@ -2,8 +2,9 @@ ; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG ; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW -; RUN: llc < %s | FileCheck %s --check-prefix=BASE -; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE +; RUN: llc < %s | FileCheck %s --check-prefix=BASE +; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE +; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown"