diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -60,6 +60,7 @@ AArch64Subtarget &, AArch64RegisterBankInfo &); FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone); FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone); +FunctionPass *createAArch64PostSelectOptimize(); FunctionPass *createAArch64StackTaggingPass(bool IsOptNone); FunctionPass *createAArch64StackTaggingPreRAPass(); @@ -80,6 +81,7 @@ void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); +void initializeAArch64PostSelectOptimizePass(PassRegistry &); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -184,6 +184,7 @@ initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); initializeAArch64PostLegalizerCombinerPass(*PR); + initializeAArch64PostSelectOptimizePass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); @@ -577,6 +578,8 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); + if (getOptLevel() != CodeGenOpt::None) + addPass(createAArch64PostSelectOptimize()); return false; } diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -29,6 +29,7 @@ GISel/AArch64LegalizerInfo.cpp 
GISel/AArch64PreLegalizerCombiner.cpp GISel/AArch64PostLegalizerCombiner.cpp + GISel/AArch64PostSelectOptimize.cpp GISel/AArch64RegisterBankInfo.cpp AArch64A57FPLoadBalancing.cpp AArch64AdvSIMDScalarPass.cpp diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp @@ -0,0 +1,160 @@ +//=== lib/CodeGen/GlobalISel/AArch64PostSelectOptimize.cpp -- ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does post-instruction-selection optimizations in the GlobalISel +// pipeline, before the rest of codegen runs. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-post-select-optimize" + +using namespace llvm; + +namespace { +class AArch64PostSelectOptimize : public MachineFunctionPass { +public: + static char ID; + + AArch64PostSelectOptimize(); + + StringRef getPassName() const override { + return "AArch64 Post Select Optimizer"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool optimizeFmcpSelRanges(MachineBasicBlock &MBB); +}; +} // end anonymous namespace + +void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + 
+  getSelectionDAGFallbackAnalysisUsage(AU);
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AArch64PostSelectOptimize::AArch64PostSelectOptimize()
+    : MachineFunctionPass(ID) {
+  initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
+}
+
+// File-local helper: map a flag-setting opcode to its non-flag-setting
+// variant, or 0 if there is none. Marked static to give it internal linkage
+// (it sits outside the anonymous namespace above).
+static unsigned getNonFlagSettingVariant(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return 0;
+  case AArch64::SUBSXrr:
+    return AArch64::SUBXrr;
+  case AArch64::SUBSWrr:
+    return AArch64::SUBWrr;
+  case AArch64::SUBSXrs:
+    return AArch64::SUBXrs;
+  case AArch64::SUBSXri:
+    return AArch64::SUBXri;
+  }
+}
+
+bool AArch64PostSelectOptimize::optimizeFmcpSelRanges(MachineBasicBlock &MBB) {
+  // Consider the following code:
+  //  FCMPSrr %0, %1, implicit-def $nzcv
+  //  %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
+  //  %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
+  //  FCMPSrr %0, %1, implicit-def $nzcv
+  //  %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
+  // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
+  // when we have a single IR fcmp being used by two selects. During selection,
+  // to ensure that there can be no clobbering of nzcv between the fcmp and the
+  // csel, we have to generate an fcmp immediately before each csel is
+  // selected.
+  // However, often we can essentially CSE these together later in MachineCSE.
+  // This doesn't work though if there are unrelated flag-setting instructions
+  // in between the two FCMPs. In this case, the SUBS defines NZCV
+  // but it doesn't have any users, being overwritten by the second FCMP.
+  //
+  // Our solution here is to try to convert flag setting operations between
+  // an interval of identical FCMPs, so that CSE will be able to eliminate one.
+  bool Changed = false;
+  const auto *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+
+  // The first step is to find the first and last FCMPs. If we have found
+  // at least two, then set the limit of the bottom-up walk to the first FCMP
+  // found since we're only interested in dealing with instructions between
+  // them.
+  MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr;
+  for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) {
+    if (MI.getOpcode() == AArch64::FCMPSrr ||
+        MI.getOpcode() == AArch64::FCMPDrr) {
+      if (!FirstCmp)
+        FirstCmp = &MI;
+      else
+        LastCmp = &MI;
+    }
+  }
+
+  if (!LastCmp)
+    return false;
+
+  LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
+  LRU.addLiveOuts(MBB);
+  bool NZCVDead = LRU.available(AArch64::NZCV);
+  for (auto II = MachineBasicBlock::iterator(LastCmp); &*II != FirstCmp; --II) {
+    LRU.stepBackward(*II);
+    // Did this instruction define NZCV?
+    bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV);
+    if (NZCVDead && NZCVDeadAtCurrInstr && II->definesRegister(AArch64::NZCV)) {
+      // If we have a def and NZCV is dead, then we can convert this op.
+      unsigned NewOpc = getNonFlagSettingVariant(II->getOpcode());
+      int DeadNZCVIdx = II->findRegisterDefOperandIdx(AArch64::NZCV);
+      if (NewOpc && DeadNZCVIdx != -1) {
+        II->setDesc(TII->get(NewOpc));
+        II->RemoveOperand(DeadNZCVIdx);
+        Changed = true;
+      }
+    }
+
+    NZCVDead = NZCVDeadAtCurrInstr;
+  }
+  return Changed;
+}
+
+bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getProperties().hasProperty(
+          MachineFunctionProperties::Property::FailedISel))
+    return false;
+
+  bool Changed = false;
+  for (auto &BB : MF) {
+    Changed |= optimizeFmcpSelRanges(BB);
+  }
+  // Report the accumulated result; unconditionally returning true here would
+  // incorrectly tell the pass manager we modified every function.
+  return Changed;
+}
+
+char AArch64PostSelectOptimize::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
+                      "Optimize AArch64 selected instructions",
+                      false, false)
+INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
+                    "Optimize AArch64 selected instructions", false,
+                    false)
+
+namespace llvm {
+FunctionPass *createAArch64PostSelectOptimize() {
+  return new
AArch64PostSelectOptimize(); +} +} // end namespace llvm diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/llvm/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll @@ -67,6 +67,7 @@ ; VERIFY-O0-NEXT: Verify generated machine code ; ENABLED-NEXT: Analysis for ComputingKnownBits ; ENABLED-NEXT: InstructionSelect +; ENABLED-O1-NEXT: AArch64 Post Select Optimizer ; VERIFY-NEXT: Verify generated machine code ; ENABLED-NEXT: ResetMachineFunction diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postselectopt-dead-cc-defs-in-fcmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postselectopt-dead-cc-defs-in-fcmp.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postselectopt-dead-cc-defs-in-fcmp.mir @@ -0,0 +1,138 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-post-select-optimize -verify-machineinstrs %s -o - | FileCheck %s +--- +name: test_fcmp_dead_cc +alignment: 4 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +liveins: + - { reg: '$x0' } + - { reg: '$w1' } +body: | + bb.1: + liveins: $w1, $x0, $s0, $s1 + + ; CHECK-LABEL: name: test_fcmp_dead_cc + ; CHECK: liveins: $w1, $x0, $s0, $s1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK: [[COPY3:%[0-9]+]]:fpr32 = COPY $s1 + ; CHECK: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK: FCMPSrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[SUBWrr:%[0-9]+]]:gpr32 = SUBWrr [[COPY1]], [[COPY4]] + ; CHECK: FCMPSrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32common = UBFMWri [[SUBWrr]], 1, 31 + ; CHECK: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK: [[CSELWr:%[0-9]+]]:gpr32 = 
CSELWr [[UBFMWri]], [[MOVi32imm]], 8, implicit $nzcv + ; CHECK: $w0 = COPY [[CSELWr]] + ; CHECK: RET_ReallyLR implicit $w0 + %1:gpr64 = COPY $x0 + %2:gpr32 = COPY $w1 + %3:fpr32 = COPY $s0 + %4:fpr32 = COPY $s1 + %26:gpr32 = COPY $wzr + FCMPSrr %3, %4, implicit-def $nzcv + %12:gpr32 = SUBSWrr %2, %26, implicit-def $nzcv + FCMPSrr %3, %4, implicit-def $nzcv + %14:gpr32common = UBFMWri %12, 1, 31 + %60:gpr32 = MOVi32imm 1 + %16:gpr32 = CSELWr %14, %60, 8, implicit $nzcv + $w0 = COPY %16 + RET_ReallyLR implicit $w0 + +... +--- +name: test_fcmp_64_dead_cc +alignment: 4 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +liveins: + - { reg: '$x0' } + - { reg: '$w1' } +body: | + bb.1: + liveins: $w1, $x0, $d0, $d1 + + ; CHECK-LABEL: name: test_fcmp_64_dead_cc + ; CHECK: liveins: $w1, $x0, $d0, $d1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK: [[COPY3:%[0-9]+]]:fpr64 = COPY $d1 + ; CHECK: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK: FCMPDrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[SUBWrr:%[0-9]+]]:gpr32 = SUBWrr [[COPY1]], [[COPY4]] + ; CHECK: FCMPDrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32common = UBFMWri [[SUBWrr]], 1, 31 + ; CHECK: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK: [[CSELWr:%[0-9]+]]:gpr32 = CSELWr [[UBFMWri]], [[MOVi32imm]], 8, implicit $nzcv + ; CHECK: $w0 = COPY [[CSELWr]] + ; CHECK: RET_ReallyLR implicit $w0 + %1:gpr64 = COPY $x0 + %2:gpr32 = COPY $w1 + %3:fpr64 = COPY $d0 + %4:fpr64 = COPY $d1 + %26:gpr32 = COPY $wzr + FCMPDrr %3, %4, implicit-def $nzcv + %12:gpr32 = SUBSWrr %2, %26, implicit-def $nzcv + FCMPDrr %3, %4, implicit-def $nzcv + %14:gpr32common = UBFMWri %12, 1, 31 + %60:gpr32 = MOVi32imm 1 + %16:gpr32 = CSELWr %14, %60, 8, implicit $nzcv + $w0 = COPY %16 + RET_ReallyLR implicit $w0 + +... 
+--- +name: test_fcmp_dead_cc_3_fcmps +alignment: 4 +legalized: true +regBankSelected: true +selected: true +tracksRegLiveness: true +liveins: + - { reg: '$x0' } + - { reg: '$w1' } +body: | + bb.1: + liveins: $w1, $x0, $s0, $s1 + + ; CHECK-LABEL: name: test_fcmp_dead_cc_3_fcmps + ; CHECK: liveins: $w1, $x0, $s0, $s1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK: [[COPY3:%[0-9]+]]:fpr32 = COPY $s1 + ; CHECK: [[COPY4:%[0-9]+]]:gpr32 = COPY $wzr + ; CHECK: FCMPSrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[SUBWrr:%[0-9]+]]:gpr32 = SUBWrr [[COPY1]], [[COPY4]] + ; CHECK: FCMPSrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[SUBWrr1:%[0-9]+]]:gpr32 = SUBWrr [[COPY1]], [[COPY4]] + ; CHECK: FCMPSrr [[COPY2]], [[COPY3]], implicit-def $nzcv + ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32common = UBFMWri [[SUBWrr1]], 1, 31 + ; CHECK: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK: [[CSELWr:%[0-9]+]]:gpr32 = CSELWr [[UBFMWri]], [[MOVi32imm]], 8, implicit $nzcv + ; CHECK: $w0 = COPY [[CSELWr]] + ; CHECK: RET_ReallyLR implicit $w0 + %1:gpr64 = COPY $x0 + %2:gpr32 = COPY $w1 + %3:fpr32 = COPY $s0 + %4:fpr32 = COPY $s1 + %26:gpr32 = COPY $wzr + FCMPSrr %3, %4, implicit-def $nzcv + %12:gpr32 = SUBSWrr %2, %26, implicit-def $nzcv + FCMPSrr %3, %4, implicit-def $nzcv + %12:gpr32 = SUBSWrr %2, %26, implicit-def $nzcv + FCMPSrr %3, %4, implicit-def $nzcv + %14:gpr32common = UBFMWri %12, 1, 31 + %60:gpr32 = MOVi32imm 1 + %16:gpr32 = CSELWr %14, %60, 8, implicit $nzcv + $w0 = COPY %16 + RET_ReallyLR implicit $w0 + +... 
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
@@ -130,6 +130,7 @@
     "GISel/AArch64LegalizerInfo.cpp",
     "GISel/AArch64PostLegalizerCombiner.cpp",
+    "GISel/AArch64PostSelectOptimize.cpp",
     "GISel/AArch64PreLegalizerCombiner.cpp",
     "GISel/AArch64RegisterBankInfo.cpp",
     "SVEIntrinsicOpts.cpp",
   ]