Index: llvm/trunk/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.td
+++ llvm/trunk/lib/Target/AArch64/AArch64.td
@@ -94,9 +94,13 @@
     "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
     "true", "Use alternative pattern for sextload convert to f32">;
 
-def FeatureMacroOpFusion : SubtargetFeature<
-    "macroop-fusion", "HasMacroOpFusion", "true",
-    "CPU supports macro op fusion">;
+def FeatureArithmeticBccFusion : SubtargetFeature<
+    "arith-bcc-fusion", "HasArithmeticBccFusion", "true",
+    "CPU fuses arithmetic+bcc operations">;
+
+def FeatureArithmeticCbzFusion : SubtargetFeature<
+    "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
+    "CPU fuses arithmetic + cbz/cbnz operations">;
 
 def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
@@ -204,7 +208,8 @@
                                    FeatureCrypto,
                                    FeatureDisableLatencySchedHeuristic,
                                    FeatureFPARMv8,
-                                   FeatureMacroOpFusion,
+                                   FeatureArithmeticBccFusion,
+                                   FeatureArithmeticCbzFusion,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeatureSlowMisaligned128Store,
@@ -244,7 +249,7 @@
                                    FeatureCRC,
                                    FeatureCrypto,
                                    FeatureFPARMv8,
-                                   FeatureMacroOpFusion,
+                                   FeatureArithmeticBccFusion,
                                    FeatureNEON,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1876,39 +1876,80 @@
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
                                               MachineInstr &Second) const {
-  if (Subtarget.hasMacroOpFusion()) {
+  if (Subtarget.hasArithmeticBccFusion()) {
     // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First.getOpcode()) {
       default:
         return false;
-      case AArch64::SUBSWri:
       case AArch64::ADDSWri:
-      case AArch64::ANDSWri:
-      case AArch64::SUBSXri:
+      case AArch64::ADDSWrr:
       case AArch64::ADDSXri:
+      case AArch64::ADDSXrr:
+      case AArch64::ANDSWri:
+      case AArch64::ANDSWrr:
       case AArch64::ANDSXri:
+      case AArch64::ANDSXrr:
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+      case AArch64::BICSWrr:
+      case AArch64::BICSXrr:
         return true;
+      case AArch64::ADDSWrs:
+      case AArch64::ADDSXrs:
+      case AArch64::ANDSWrs:
+      case AArch64::ANDSXrs:
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+      case AArch64::BICSWrs:
+      case AArch64::BICSXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
       }
     }
+  }
+  if (Subtarget.hasArithmeticCbzFusion()) {
     // Fuse ALU operations followed by CBZ/CBNZ.
+    unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First.getOpcode()) {
       default:
         return false;
       case AArch64::ADDWri:
+      case AArch64::ADDWrr:
       case AArch64::ADDXri:
+      case AArch64::ADDXrr:
       case AArch64::ANDWri:
+      case AArch64::ANDWrr:
       case AArch64::ANDXri:
+      case AArch64::ANDXrr:
       case AArch64::EORWri:
+      case AArch64::EORWrr:
       case AArch64::EORXri:
+      case AArch64::EORXrr:
       case AArch64::ORRWri:
+      case AArch64::ORRWrr:
       case AArch64::ORRXri:
+      case AArch64::ORRXrr:
       case AArch64::SUBWri:
+      case AArch64::SUBWrr:
       case AArch64::SUBXri:
+      case AArch64::SUBXrr:
         return true;
+      case AArch64::ADDWrs:
+      case AArch64::ADDXrs:
+      case AArch64::ANDWrs:
+      case AArch64::ANDXrs:
+      case AArch64::SUBWrs:
+      case AArch64::SUBXrs:
+      case AArch64::BICWrs:
+      case AArch64::BICXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
       }
     }
   }
Index: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
@@ -80,7 +80,8 @@
   bool Misaligned128StoreIsSlow = false;
   bool AvoidQuadLdStPairs = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
-  bool HasMacroOpFusion = false;
+  bool HasArithmeticBccFusion = false;
+  bool HasArithmeticCbzFusion = false;
   bool DisableLatencySchedHeuristic = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
@@ -188,7 +189,8 @@
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
   }
-  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
+  bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
Index: llvm/trunk/test/CodeGen/AArch64/misched-fusion.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/misched-fusion.ll
+++ llvm/trunk/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -o - %s -mattr=+macroop-fusion,+use-postra-scheduler | FileCheck %s
+; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
 target triple = "arm64-apple-ios"
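Usage sketch (illustration only, not part of the patch above): once the macro-op fusion feature is split, the two fusion kinds are gated by separate subtarget attributes, so a test or a target can request arith+Bcc fusion, arith+CBZ/CBNZ fusion, or both independently via llc's -mattr flag, mirroring the RUN line in misched-fusion.ll. The CHECK lines such a test would need are omitted here.

  ; RUN: llc -o - %s -mattr=+arith-bcc-fusion,+use-postra-scheduler | FileCheck %s
  ; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
  ; RUN: llc -o - %s -mattr=+arith-bcc-fusion,+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s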