Index: llvm/trunk/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.td
+++ llvm/trunk/lib/Target/AArch64/AArch64.td
@@ -188,14 +188,18 @@
     "fuse-aes", "HasFuseAES", "true",
     "CPU fuses AES crypto operations">;
 
-def FeatureFuseCryptoEOR : SubtargetFeature<
-    "fuse-crypto-eor", "HasFuseCryptoEOR", "true",
-    "CPU fuses AES/PMULL and EOR operations">;
+def FeatureFuseArithmeticLogic : SubtargetFeature<
+    "fuse-arith-logic", "HasFuseArithmeticLogic", "true",
+    "CPU fuses arithmetic and logic operations">;
 
 def FeatureFuseCCSelect : SubtargetFeature<
     "fuse-csel", "HasFuseCCSelect", "true",
     "CPU fuses conditional select operations">;
 
+def FeatureFuseCryptoEOR : SubtargetFeature<
+    "fuse-crypto-eor", "HasFuseCryptoEOR", "true",
+    "CPU fuses AES/PMULL and EOR operations">;
+
 def FeatureFuseLiterals : SubtargetFeature<
     "fuse-literals", "HasFuseLiterals", "true",
     "CPU fuses literal generation operations">;
Index: llvm/trunk/lib/Target/AArch64/AArch64MacroFusion.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -270,7 +270,107 @@
   return false;
 }
 
-/// Check if the instr pair, FirstMI and SecondMI, should be fused
+// Arithmetic and logic.
+static bool isArithmeticLogicPair(const MachineInstr *FirstMI,
+                                  const MachineInstr &SecondMI) {
+  if (AArch64InstrInfo::hasShiftedReg(SecondMI))
+    return false;
+
+  switch (SecondMI.getOpcode()) {
+  // Arithmetic
+  case AArch64::ADDWrr:
+  case AArch64::ADDXrr:
+  case AArch64::SUBWrr:
+  case AArch64::SUBXrr:
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+  // Logic
+  case AArch64::ANDWrr:
+  case AArch64::ANDXrr:
+  case AArch64::BICWrr:
+  case AArch64::BICXrr:
+  case AArch64::EONWrr:
+  case AArch64::EONXrr:
+  case AArch64::EORWrr:
+  case AArch64::EORXrr:
+  case AArch64::ORNWrr:
+  case AArch64::ORNXrr:
+  case AArch64::ORRWrr:
+  case AArch64::ORRXrr:
+  case AArch64::ANDWrs:
+  case AArch64::ANDXrs:
+  case AArch64::BICWrs:
+  case AArch64::BICXrs:
+  case AArch64::EONWrs:
+  case AArch64::EONXrs:
+  case AArch64::EORWrs:
+  case AArch64::EORXrs:
+  case AArch64::ORNWrs:
+  case AArch64::ORNXrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+    // Assume the 1st instr to be a wildcard if it is unspecified.
+    if (FirstMI == nullptr)
+      return true;
+
+    // Arithmetic
+    switch (FirstMI->getOpcode()) {
+    case AArch64::ADDWrr:
+    case AArch64::ADDXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
+    case AArch64::SUBWrr:
+    case AArch64::SUBXrr:
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+      return true;
+    case AArch64::ADDWrs:
+    case AArch64::ADDXrs:
+    case AArch64::ADDSWrs:
+    case AArch64::ADDSXrs:
+    case AArch64::SUBWrs:
+    case AArch64::SUBXrs:
+    case AArch64::SUBSWrs:
+    case AArch64::SUBSXrs:
+      return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
+    }
+    break;
+
+  // Arithmetic, setting flags.
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSXrr:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSXrr:
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSXrs:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSXrs:
+    // Assume the 1st instr to be a wildcard if it is unspecified.
+    if (FirstMI == nullptr)
+      return true;
+
+    // Arithmetic, not setting flags.
+    switch (FirstMI->getOpcode()) {
+    case AArch64::ADDWrr:
+    case AArch64::ADDXrr:
+    case AArch64::SUBWrr:
+    case AArch64::SUBXrr:
+      return true;
+    case AArch64::ADDWrs:
+    case AArch64::ADDXrs:
+    case AArch64::SUBWrs:
+    case AArch64::SUBXrs:
+      return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
+    }
+    break;
+  }
+
+  return false;
+}
+
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
 /// together. Given SecondMI, when FirstMI is unspecified, then check if
 /// SecondMI may be part of a fused pair at all.
 static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
@@ -295,6 +395,8 @@
     return true;
   if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
     return true;
+  if (ST.hasFuseArithmeticLogic() && isArithmeticLogicPair(FirstMI, SecondMI))
+    return true;
 
   return false;
 }
Index: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
@@ -166,8 +166,9 @@
   bool HasArithmeticCbzFusion = false;
   bool HasFuseAddress = false;
   bool HasFuseAES = false;
-  bool HasFuseCryptoEOR = false;
+  bool HasFuseArithmeticLogic = false;
   bool HasFuseCCSelect = false;
+  bool HasFuseCryptoEOR = false;
   bool HasFuseLiterals = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
@@ -311,14 +312,16 @@
   bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
   bool hasFuseAddress() const { return HasFuseAddress; }
   bool hasFuseAES() const { return HasFuseAES; }
-  bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; }
+  bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; }
   bool hasFuseCCSelect() const { return HasFuseCCSelect; }
+  bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; }
   bool hasFuseLiterals() const { return HasFuseLiterals; }
 
   /// Return true if the CPU supports any kind of instruction fusion.
   bool hasFusion() const {
     return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
-           hasFuseAES() || hasFuseCCSelect() || hasFuseLiterals();
+           hasFuseAES() || hasFuseArithmeticLogic() ||
+           hasFuseCCSelect() || hasFuseLiterals();
   }
 
   bool useRSqrt() const { return UseRSqrt; }
Index: llvm/trunk/test/CodeGen/AArch64/misched-fusion-arith-logic.mir
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/misched-fusion-arith-logic.mir
+++ llvm/trunk/test/CodeGen/AArch64/misched-fusion-arith-logic.mir
@@ -0,0 +1,111 @@
+# RUN: llc -o - %s -mtriple aarch64-unknown -mattr=fuse-arith-logic -run-pass=machine-scheduler -misched-print-dags |& FileCheck %s
+# REQUIRES: asserts
+
+---
+name: arith
+body: |
+  bb.0.entry:
+    %0:gpr32 = SUBWrr undef $w0, undef $w1
+    %1:gpr32 = ADDWrr undef $w1, undef $w2
+    %2:gpr32 = SUBWrs %0, undef $w2, 0
+    %3:gpr32 = ADDWrs %1, undef $w3, 0
+
+    ; CHECK: SU(0): %0:gpr32 = SUBWrr undef $w0, undef $w1
+    ; CHECK: Successors:
+    ; CHECK: SU(2): Ord Latency=0 Cluster
+    ; CHECK: SU(1): %1:gpr32 = ADDWrr undef $w1, undef $w2
+    ; CHECK: Successors:
+    ; CHECK: SU(3): Ord Latency=0 Cluster
+    ; CHECK: SU(2): dead %2:gpr32 = SUBWrs %0:gpr32, undef $w2, 0
+    ; CHECK: Predecessors:
+    ; CHECK: SU(0): Ord Latency=0 Cluster
+    ; CHECK: SU(3): dead %3:gpr32 = ADDWrs %1:gpr32, undef $w3, 0
+    ; CHECK: Predecessors:
+    ; CHECK: SU(1): Ord Latency=0 Cluster
+...
+---
+name: compare
+body: |
+  bb.0.entry:
+    %0:gpr64 = ADDXrr undef $x0, undef $x1
+    %1:gpr64 = SUBXrs undef $x1, undef $x2, 0
+    %2:gpr64 = ADDSXrr %0, undef $x3, implicit-def $nzcv
+    %3:gpr64 = SUBSXrs %1, undef $x4, 0, implicit-def $nzcv
+
+    ; CHECK: SU(0): %0:gpr64 = ADDXrr undef $x0, undef $x1
+    ; CHECK: Successors:
+    ; CHECK: SU(2): Ord Latency=0 Cluster
+    ; CHECK: SU(1): %1:gpr64 = SUBXrs undef $x1, undef $x2, 0
+    ; CHECK: Successors:
+    ; CHECK: SU(3): Ord Latency=0 Cluster
+    ; CHECK: SU(2): dead %2:gpr64 = ADDSXrr %0:gpr64, undef $x3, implicit-def $nzcv
+    ; CHECK: Predecessors:
+    ; CHECK: SU(0): Ord Latency=0 Cluster
+    ; CHECK: SU(3): dead %3:gpr64 = SUBSXrs %1:gpr64, undef $x4, 0, implicit-def $nzcv
+    ; CHECK: Predecessors:
+    ; CHECK: SU(1): Ord Latency=0 Cluster
+...
+---
+name: logic
+body: |
+  bb.0.entry:
+    %0:gpr32 = ADDWrr undef $w0, undef $w1
+    %1:gpr64 = SUBXrs undef $x1, undef $x2, 0
+    %2:gpr32 = ANDWrs %0, undef $w3, 0
+    %3:gpr64 = ORRXrr %1, undef $x4
+
+    ; CHECK: SU(0): %0:gpr32 = ADDWrr undef $w0, undef $w1
+    ; CHECK: Successors:
+    ; CHECK: SU(2): Ord Latency=0 Cluster
+    ; CHECK: SU(1): %1:gpr64 = SUBXrs undef $x1, undef $x2, 0
+    ; CHECK: Successors:
+    ; CHECK: SU(3): Ord Latency=0 Cluster
+    ; CHECK: SU(2): dead %2:gpr32 = ANDWrs %0:gpr32, undef $w3, 0
+    ; CHECK: Predecessors:
+    ; CHECK: SU(0): Ord Latency=0 Cluster
+    ; CHECK: SU(3): dead %3:gpr64 = ORRXrr %1:gpr64, undef $x4
+    ; CHECK: Predecessors:
+    ; CHECK: SU(1): Ord Latency=0 Cluster
+...
+---
+name: nope
+body: |
+  bb.0.entry:
+    ; Shifted register.
+    %0:gpr32 = SUBWrr undef $w0, undef $w1
+    %1:gpr32 = SUBWrs %0, undef $w2, 1
+    ; CHECK: SU(0): %0:gpr32 = SUBWrr undef $w0, undef $w1
+    ; CHECK: Successors:
+    ; CHECK-NOT: SU(1): Ord Latency=0 Cluster
+    ; CHECK: SU(1): dead %1:gpr32 = SUBWrs %0:gpr32, undef $w2, 1
+
+    ; Multiple successors.
+    %2:gpr64 = ADDXrr undef $x0, undef $x1
+    %3:gpr32 = EXTRACT_SUBREG %2, %subreg.sub_32
+    %4:gpr32 = ANDWrs %3, undef $w2, 0
+    %5:gpr64 = ADDSXrr %2, undef $x3, implicit-def $nzcv
+    ; CHECK: SU(2): %2:gpr64 = ADDXrr undef $x0, undef $x1
+    ; CHECK: Successors:
+    ; CHECK-NOT: SU(3): Ord Latency=0 Cluster
+    ; CHECK: SU(5): Ord Latency=0 Cluster
+    ; CHECK: SU(3): %3:gpr32 = EXTRACT_SUBREG %2:gpr64, %subreg.sub_32
+    ; CHECK: SU(5): dead %5:gpr64 = ADDSXrr %2:gpr64, undef $x3, implicit-def $nzcv
+
+    ; Different register sizes.
+    %6:gpr32 = SUBWrr undef $w0, undef $w1
+    %7:gpr64 = ADDXrr undef $x1, undef $x2
+    %8:gpr64 = SUBXrr %7, undef $x3
+    %9:gpr32 = ADDWrr %6, undef $w4
+    ; CHECK: SU(6): %6:gpr32 = SUBWrr undef $w0, undef $w1
+    ; CHECK: Successors:
+    ; CHECK-NOT: SU(8): Ord Latency=0 Cluster
+    ; CHECK: SU(7): %7:gpr64 = ADDXrr undef $x1, undef $x2
+    ; CHECK: Successors:
+    ; CHECK-NOT: SU(9): Ord Latency=0 Cluster
+    ; CHECK: SU(8): dead %8:gpr64 = SUBXrr %7:gpr64, undef $x3
+    ; CHECK: Predecessors:
+    ; CHECK: SU(7): Ord Latency=0 Cluster
+    ; CHECK: SU(9): dead %9:gpr32 = ADDWrr %6:gpr32, undef $w4
+    ; CHECK: Predecessors:
+    ; CHECK: SU(6): Ord Latency=0 Cluster
+...