Index: llvm/trunk/lib/Target/X86/X86.td =================================================================== --- llvm/trunk/lib/Target/X86/X86.td +++ llvm/trunk/lib/Target/X86/X86.td @@ -344,6 +344,12 @@ "ermsb", "HasERMSB", "true", "REP MOVS/STOS are fast">; +// Bulldozer and newer processors can merge CMP/TEST (but not other +// instructions) with conditional branches. +def FeatureBranchFusion + : SubtargetFeature<"branchfusion", "HasBranchFusion", "true", + "CMP/TEST can be fused with conditional branches">; + // Sandy Bridge and newer processors have many instructions that can be // fused with conditional branches and pass through the CPU as a single // operation. @@ -810,7 +816,7 @@ FeatureSlowSHLD, FeatureLAHFSAHF, FeatureFast11ByteNOP, - FeatureMacroFusion]; + FeatureBranchFusion]; list BdVer1Features = BdVer1InheritableFeatures; // PileDriver @@ -860,7 +866,7 @@ FeatureLZCNT, FeatureFastBEXTR, FeatureFast15ByteNOP, - FeatureMacroFusion, + FeatureBranchFusion, FeatureMMX, FeatureMOVBE, FeatureMWAITX, Index: llvm/trunk/lib/Target/X86/X86MacroFusion.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86MacroFusion.cpp +++ llvm/trunk/lib/Target/X86/X86MacroFusion.cpp @@ -18,59 +18,29 @@ using namespace llvm; -/// Check if the instr pair, FirstMI and SecondMI, should be fused -/// together. Given SecondMI, when FirstMI is unspecified, then check if -/// SecondMI may be part of a fused pair at all. -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, - const TargetSubtargetInfo &TSI, - const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { - const X86Subtarget &ST = static_cast(TSI); - // Check if this processor supports macro-fusion. - if (!ST.hasMacroFusion()) - return false; +namespace { - enum { - FuseTest, - FuseCmp, - FuseInc - } FuseKind; - - unsigned FirstOpcode = FirstMI - ? 
FirstMI->getOpcode() - : static_cast(X86::INSTRUCTION_LIST_END); - unsigned SecondOpcode = SecondMI.getOpcode(); +// The classification for the first instruction. +enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid }; - switch (SecondOpcode) { - default: - return false; - case X86::JE_1: - case X86::JNE_1: - case X86::JL_1: - case X86::JLE_1: - case X86::JG_1: - case X86::JGE_1: - FuseKind = FuseInc; - break; - case X86::JB_1: - case X86::JBE_1: - case X86::JA_1: - case X86::JAE_1: - FuseKind = FuseCmp; - break; - case X86::JS_1: - case X86::JNS_1: - case X86::JP_1: - case X86::JNP_1: - case X86::JO_1: - case X86::JNO_1: - FuseKind = FuseTest; - break; - } +// The classification for the second instruction (jump). +enum class JumpKind { + // JE, JL, JG and variants. + ELG, + // JA, JB and variants. + AB, + // JS, JP, JO and variants. + SPO, + // Not a fusable jump. + Invalid, +}; - switch (FirstOpcode) { +} // namespace + +static FirstInstrKind classifyFirst(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: - return false; + return FirstInstrKind::Invalid; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: @@ -83,6 +53,7 @@ case X86::TEST16mr: case X86::TEST32mr: case X86::TEST64mr: + return FirstInstrKind::Test; case X86::AND16ri: case X86::AND16ri8: case X86::AND16rm: @@ -98,7 +69,7 @@ case X86::AND8ri: case X86::AND8rm: case X86::AND8rr: - return true; + return FirstInstrKind::And; case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP16rm: @@ -118,6 +89,7 @@ case X86::CMP8rm: case X86::CMP8rr: case X86::CMP8mr: + return FirstInstrKind::Cmp; case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri8_DB: @@ -159,7 +131,7 @@ case X86::SUB8ri: case X86::SUB8rm: case X86::SUB8rr: - return FuseKind == FuseCmp || FuseKind == FuseInc; + return FirstInstrKind::ALU; case X86::INC16r: case X86::INC32r: case X86::INC64r: @@ -168,12 +140,85 @@ case X86::DEC32r: case X86::DEC64r: case X86::DEC8r: - return FuseKind == FuseInc; - case 
X86::INSTRUCTION_LIST_END:
-    return true;
+    return FirstInstrKind::IncDec;
   }
 }
 
+static JumpKind classifySecond(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return JumpKind::Invalid;
+  case X86::JE_1:
+  case X86::JNE_1:
+  case X86::JL_1:
+  case X86::JLE_1:
+  case X86::JG_1:
+  case X86::JGE_1:
+    return JumpKind::ELG;
+  case X86::JB_1:
+  case X86::JBE_1:
+  case X86::JA_1:
+  case X86::JAE_1:
+    return JumpKind::AB;
+  case X86::JS_1:
+  case X86::JNS_1:
+  case X86::JP_1:
+  case X86::JNP_1:
+  case X86::JO_1:
+  case X86::JNO_1:
+    return JumpKind::SPO;
+  }
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused together.
+/// When FirstMI is null, only report whether SecondMI (a conditional jump)
+/// may be part of a fused pair at all on this subtarget.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+                                   const TargetSubtargetInfo &TSI,
+                                   const MachineInstr *FirstMI,
+                                   const MachineInstr &SecondMI) {
+  const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI);
+
+  // Check if this processor supports any kind of fusion.
+  if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
+    return false;
+
+  const JumpKind BranchKind = classifySecond(SecondMI);
+
+  if (BranchKind == JumpKind::Invalid)
+    return false; // Second cannot be fused with anything.
+
+  if (FirstMI == nullptr)
+    return true; // We're only checking whether Second can be fused at all.
+
+  const FirstInstrKind TestKind = classifyFirst(*FirstMI);
+
+  // Branch fusion can merge CMP and TEST with all conditional jumps. Checked
+  // on its own so that enabling both features yields the union of the rules.
+  if (ST.hasBranchFusion() && (TestKind == FirstInstrKind::Cmp ||
+                               TestKind == FirstInstrKind::Test))
+    return true;
+
+  if (!ST.hasMacroFusion())
+    return false;
+
+  // Macro Fusion rules are a bit more complex. See Agner Fog's
+  // Microarchitecture table 9.2 "Instruction Fusion".
+  switch (TestKind) {
+  case FirstInstrKind::Test:
+  case FirstInstrKind::And:
+    return true;
+  case FirstInstrKind::Cmp:
+  case FirstInstrKind::ALU:
+    return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB;
+  case FirstInstrKind::IncDec:
+    return BranchKind == JumpKind::ELG;
+  case FirstInstrKind::Invalid:
+    return false;
+  }
+  llvm_unreachable("unknown branch fusion type");
+}
+
 namespace llvm {
 
 std::unique_ptr
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -297,6 +297,9 @@
   /// True if the processor supports macrofusion.
   bool HasMacroFusion = false;
 
+  /// True if the processor supports branch fusion.
+  bool HasBranchFusion = false;
+
   /// True if the processor has enhanced REP MOVSB/STOSB.
   bool HasERMSB = false;
 
@@ -642,6 +645,7 @@
   bool hasFastBEXTR() const { return HasFastBEXTR; }
   bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasMacroFusion() const { return HasMacroFusion; }
+  bool hasBranchFusion() const { return HasBranchFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
@@ -59,6 +59,7 @@
       X86::FeatureLEAForSP,
       X86::FeatureLEAUsesAG,
       X86::FeatureLZCNTFalseDeps,
+      X86::FeatureBranchFusion,
       X86::FeatureMacroFusion,
       X86::FeatureMergeToThreeWayBranch,
       X86::FeaturePadShortFunctions,
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2984,7 +2984,7 @@ } bool X86TTIImpl::canMacroFuseCmp() { - return ST->hasMacroFusion(); + return ST->hasMacroFusion() || ST->hasBranchFusion(); } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { Index: llvm/trunk/test/CodeGen/X86/testb-je-fusion.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/testb-je-fusion.ll +++ llvm/trunk/test/CodeGen/X86/testb-je-fusion.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion | FileCheck %s --check-prefix=NOFUSION -; RUN: llc < %s -mtriple=x86_64-- -mattr=+macrofusion | FileCheck %s --check-prefix=MACROFUSION +; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion,-branchfusion | FileCheck %s --check-prefix=NOFUSION +; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion,+branchfusion | FileCheck %s --check-prefix=BRANCHFUSION --check-prefix=BRANCHFUSIONONLY +; RUN: llc < %s -mtriple=x86_64-- -mattr=+macrofusion,-branchfusion | FileCheck %s --check-prefix=BRANCHFUSION --check-prefix=MACROFUSION ; testb should be scheduled right before je to enable macro-fusion. 
@@ -16,16 +17,16 @@ ; NOFUSION-NEXT: .LBB0_2: # %if.end ; NOFUSION-NEXT: retq ; -; MACROFUSION-LABEL: macrofuse_test_je: -; MACROFUSION: # %bb.0: # %entry -; MACROFUSION-NEXT: xorl %eax, %eax -; MACROFUSION-NEXT: movb $1, (%rsi) -; MACROFUSION-NEXT: testl $512, %edi # imm = 0x200 -; MACROFUSION-NEXT: je .LBB0_2 -; MACROFUSION-NEXT: # %bb.1: # %if.then -; MACROFUSION-NEXT: movl $1, %eax -; MACROFUSION-NEXT: .LBB0_2: # %if.end -; MACROFUSION-NEXT: retq +; BRANCHFUSION-LABEL: macrofuse_test_je: +; BRANCHFUSION: # %bb.0: # %entry +; BRANCHFUSION-NEXT: xorl %eax, %eax +; BRANCHFUSION-NEXT: movb $1, (%rsi) +; BRANCHFUSION-NEXT: testl $512, %edi # imm = 0x200 +; BRANCHFUSION-NEXT: je .LBB0_2 +; BRANCHFUSION-NEXT: # %bb.1: # %if.then +; BRANCHFUSION-NEXT: movl $1, %eax +; BRANCHFUSION-NEXT: .LBB0_2: # %if.end +; BRANCHFUSION-NEXT: retq entry: %and = and i32 %flags, 512 %tobool = icmp eq i32 %and, 0 @@ -53,17 +54,17 @@ ; NOFUSION-NEXT: xorl %eax, %eax ; NOFUSION-NEXT: retq ; -; MACROFUSION-LABEL: macrofuse_cmp_je: -; MACROFUSION: # %bb.0: # %entry -; MACROFUSION-NEXT: movb $1, (%rsi) -; MACROFUSION-NEXT: cmpl $512, %edi # imm = 0x200 -; MACROFUSION-NEXT: je .LBB1_1 -; MACROFUSION-NEXT: # %bb.2: # %if.then -; MACROFUSION-NEXT: movl $1, %eax -; MACROFUSION-NEXT: retq -; MACROFUSION-NEXT: .LBB1_1: -; MACROFUSION-NEXT: xorl %eax, %eax -; MACROFUSION-NEXT: retq +; BRANCHFUSION-LABEL: macrofuse_cmp_je: +; BRANCHFUSION: # %bb.0: # %entry +; BRANCHFUSION-NEXT: movb $1, (%rsi) +; BRANCHFUSION-NEXT: cmpl $512, %edi # imm = 0x200 +; BRANCHFUSION-NEXT: je .LBB1_1 +; BRANCHFUSION-NEXT: # %bb.2: # %if.then +; BRANCHFUSION-NEXT: movl $1, %eax +; BRANCHFUSION-NEXT: retq +; BRANCHFUSION-NEXT: .LBB1_1: +; BRANCHFUSION-NEXT: xorl %eax, %eax +; BRANCHFUSION-NEXT: retq entry: %sub = sub i32 %flags, 512 %tobool = icmp eq i32 %sub, 0 @@ -90,6 +91,17 @@ ; NOFUSION-NEXT: .LBB2_2: # %if.end ; NOFUSION-NEXT: retq ; +; BRANCHFUSIONONLY-LABEL: macrofuse_alu_je: +; BRANCHFUSIONONLY: # %bb.0: # %entry +; 
BRANCHFUSIONONLY-NEXT: movl %edi, %eax +; BRANCHFUSIONONLY-NEXT: addl $-512, %eax # imm = 0xFE00 +; BRANCHFUSIONONLY-NEXT: movb $1, (%rsi) +; BRANCHFUSIONONLY-NEXT: je .LBB2_2 +; BRANCHFUSIONONLY-NEXT: # %bb.1: # %if.then +; BRANCHFUSIONONLY-NEXT: movl $1, %eax +; BRANCHFUSIONONLY-NEXT: .LBB2_2: # %if.end +; BRANCHFUSIONONLY-NEXT: retq +; ; MACROFUSION-LABEL: macrofuse_alu_je: ; MACROFUSION: # %bb.0: # %entry ; MACROFUSION-NEXT: movl %edi, %eax @@ -126,6 +138,17 @@ ; NOFUSION-NEXT: .LBB3_2: # %if.end ; NOFUSION-NEXT: retq ; +; BRANCHFUSIONONLY-LABEL: macrofuse_dec_je: +; BRANCHFUSIONONLY: # %bb.0: # %entry +; BRANCHFUSIONONLY-NEXT: movl %edi, %eax +; BRANCHFUSIONONLY-NEXT: decl %eax +; BRANCHFUSIONONLY-NEXT: movb $1, (%rsi) +; BRANCHFUSIONONLY-NEXT: je .LBB3_2 +; BRANCHFUSIONONLY-NEXT: # %bb.1: # %if.then +; BRANCHFUSIONONLY-NEXT: movl $1, %eax +; BRANCHFUSIONONLY-NEXT: .LBB3_2: # %if.end +; BRANCHFUSIONONLY-NEXT: retq +; ; MACROFUSION-LABEL: macrofuse_dec_je: ; MACROFUSION: # %bb.0: # %entry ; MACROFUSION-NEXT: movl %edi, %eax Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll =================================================================== --- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll +++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll @@ -3,8 +3,9 @@ ; RUN: opt < %s -loop-reduce -mcpu=bdver2 -S | FileCheck %s --check-prefix=BUL ; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW -; RUN: llc < %s | FileCheck %s --check-prefix=BASE -; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE +; RUN: llc < %s | FileCheck %s --check-prefix=BASE +; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE +; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown"