Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -44,6 +44,7 @@
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
+FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
@@ -117,6 +118,9 @@
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
 extern char &SIFixControlFlowLiveIntervalsID;
 
+void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
+extern char &SIOptimizeExecMaskingPreRAID;
+
 void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
 extern char &AMDGPUAnnotateUniformValuesPassID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -136,6 +136,7 @@
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
+  initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
@@ -769,6 +770,9 @@
 }
 
 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  if (getOptLevel() > CodeGenOpt::None)
+    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+
   // This needs to be run directly before register allocation because earlier
   // passes might recompute live intervals.
   insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -96,6 +96,7 @@
   SIMachineScheduler.cpp
   SIMemoryLegalizer.cpp
   SIOptimizeExecMasking.cpp
+  SIOptimizeExecMaskingPreRA.cpp
   SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
Index: lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -0,0 +1,160 @@
+//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass removes redundant S_OR_B64 instructions that re-enable
+/// lanes in the exec mask. If two SI_END_CF statements (lowered as S_OR_B64)
+/// come together with no vector instructions between them, only the outer
+/// SI_END_CF needs to be kept: because the CFG is structured, the exec bits
+/// restored by the outer end statement always cover at least the exec bits
+/// restored by the inner one.
+///
+/// This needs to run before register allocation so that the registers holding
+/// the saved exec bits can be eliminated, but after the register coalescer so
+/// that no vector register copies remain between the different end-cf
+/// statements.
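+///
+/// As a rough illustration (the register names below are made up rather than
+/// taken from real compiler output), a sequence such as
+///
+///   s_or_b64 exec, exec, s[0:1]   ; inner SI_END_CF
+///   ; ... only SALU instructions that do not read exec ...
+///   s_or_b64 exec, exec, s[2:3]   ; outer SI_END_CF
+///
+/// is collapsed to the single outer restore
+///
+///   s_or_b64 exec, exec, s[2:3]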
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
+
+namespace {
+
+class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
+    initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI optimize exec mask operations pre-RA";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
+                      "SI optimize exec mask operations pre-RA", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
+                    "SI optimize exec mask operations pre-RA", false, false)
+
+char SIOptimizeExecMaskingPreRA::ID = 0;
+
+char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;
+
+FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
+  return new SIOptimizeExecMaskingPreRA();
+}
+
+static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
+  return MI.getOpcode() == AMDGPU::S_OR_B64 &&
+         MI.modifiesRegister(AMDGPU::EXEC, TRI);
+}
+
+static bool isFullExecCopy(const MachineInstr& MI) {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return MI.getOperand(1).isReg() &&
+           MI.getOperand(1).getReg() == AMDGPU::EXEC &&
+           !MI.getOperand(1).getSubReg();
+  }
+  return false;
+}
+
+static unsigned getOrNonExecReg(const MachineInstr &MI,
+                                const SIInstrInfo &TII) {
+  auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+    return Op->getReg();
+  Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
+    return Op->getReg();
+  return AMDGPU::NoRegister;
+}
+
+static MachineInstr* getOrExecSource(const MachineInstr &MI,
+                                     const SIInstrInfo &TII,
+                                     const MachineRegisterInfo &MRI) {
+  auto SavedExec = getOrNonExecReg(MI, TII);
+  if (SavedExec == AMDGPU::NoRegister)
+    return nullptr;
+  auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
+  if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
+    return nullptr;
+  return SaveExecInst;
+}
+
+bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Look for a block that starts with an end-cf S_OR_B64 and has a single
+    // successor.
+    auto Lead = MBB.begin(), E = MBB.end();
+    if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
+      continue;
+    auto I = std::next(Lead);
+
+    // The rest of the block may only contain SALU instructions that neither
+    // read exec nor branch.
+    for ( ; I != E; ++I) {
+      if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI) ||
+          I->isBranch())
+        break;
+    }
+
+    if (I != E)
+      continue;
+
+    // The successor must itself start with an end-cf S_OR_B64 whose saved
+    // exec operand is defined by a full copy of exec.
+    const MachineBasicBlock* Succ = *MBB.succ_begin();
+    const auto NextLead = Succ->begin();
+    if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
+        !getOrExecSource(*NextLead, *TII, MRI))
+      continue;
+
+    DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
+
+    unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
+    LIS->RemoveMachineInstrFromMaps(*Lead);
+    Lead->eraseFromParent();
+    if (SaveExecReg) {
+      LIS->removeInterval(SaveExecReg);
+      LIS->createAndComputeVirtRegInterval(SaveExecReg);
+    }
+    // Recompute liveness for both reg units of exec.
+    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC_LO, TRI));
+    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC_HI, TRI));
+
+    Changed = true;
+  }
+
+  return Changed;
+}
Index: test/CodeGen/AMDGPU/collapse-endcf.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -0,0 +1,188 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}simple_nested_if:
+; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF]]
+; GCN: s_and_saveexec_b64
+; GCN-NEXT: ; mask branch [[ENDIF]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[ENDIF]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = icmp ugt i32 %tmp, 1
+  br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
+
+bb.outer.then: ; preds = %bb
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
+  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = icmp eq i32 %tmp, 2
+  br i1 %tmp5, label %bb.outer.end, label %bb.inner.then
+
+bb.inner.then: ; preds = %bb.outer.then
+  %tmp7 = add i32 %tmp, 1
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
+  store i32 1, i32 addrspace(1)* %tmp9, align 4
+  br label %bb.outer.end
+
+bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb
+  ret void
+}
+
+; GCN-LABEL: {{^}}uncollapsable_nested_if:
+; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
+; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF_INNER:BB[0-9_]+]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[ENDIF_INNER]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]]
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = icmp ugt i32 %tmp, 1
+  br i1 %tmp1, label %bb.outer.then, label %bb.outer.end
+
+bb.outer.then: ; preds = %bb
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
+  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = icmp eq i32 %tmp, 2
+  br i1 %tmp5, label %bb.inner.end, label %bb.inner.then
+
+bb.inner.then: ; preds = %bb.outer.then
+  %tmp7 = add i32 %tmp, 1
+  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
+  store i32 1, i32 addrspace(1)* %tmp8, align 4
+  br label %bb.inner.end
+
+bb.inner.end: ; preds = %bb.inner.then, %bb.outer.then
+  %tmp9 = add i32 %tmp, 2
+  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp9
+  store i32 2, i32 addrspace(1)* %tmp10, align 4
+  br label %bb.outer.end
+
+bb.outer.end: ; preds = %bb.inner.end, %bb
+  ret void
+}
+
+; GCN-LABEL: {{^}}nested_if_if_else:
+; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
+; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
+; GCN-NEXT: s_xor_b64 [[SAVEEXEC_INNER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_INNER]]
+; GCN-NEXT: ; mask branch [[THEN_INNER:BB[0-9_]+]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[THEN_INNER]]:
+; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]
+; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]
+; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
+  store i32 0, i32 addrspace(1)* %tmp1, align 4
+  %tmp2 = icmp ugt i32 %tmp, 1
+  br i1 %tmp2, label %bb.outer.then, label %bb.outer.end
+
+bb.outer.then: ; preds = %bb
+  %tmp5 = icmp eq i32 %tmp, 2
+  br i1 %tmp5, label %bb.then, label %bb.else
+
+bb.then: ; preds = %bb.outer.then
+  %tmp3 = add i32 %tmp, 1
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp3
+  store i32 1, i32 addrspace(1)* %tmp4, align 4
+  br label %bb.outer.end
+
+bb.else: ; preds = %bb.outer.then
+  %tmp7 = add i32 %tmp, 2
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
+  store i32 2, i32 addrspace(1)* %tmp9, align 4
+  br label %bb.outer.end
+
+bb.outer.end: ; preds = %bb, %bb.then, %bb.else
+  ret void
+}
+
+; GCN-LABEL: {{^}}nested_if_else_if:
+; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
+; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]
+; GCN-NEXT: ; mask branch [[THEN_OUTER:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[THEN_OUTER]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[THEN_OUTER_FLOW:BB[0-9_]+]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]
+; GCN-NEXT: {{^}}[[THEN_OUTER]]:
+; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]
+; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
+; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF_INNER_OUTER_THEN:BB[0-9_]+]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN: store_dword
+; GCN-NEXT: {{^}}[[ENDIF_INNER_OUTER_THEN]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
+; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
+  store i32 0, i32 addrspace(1)* %tmp1, align 4
+  %cc1 = icmp ugt i32 %tmp, 1
+  br i1 %cc1, label %bb.outer.then, label %bb.outer.else
+
+bb.outer.then:
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 1
+  store i32 1, i32 addrspace(1)* %tmp2, align 4
+  %cc2 = icmp eq i32 %tmp, 2
+  br i1 %cc2, label %bb.inner.then, label %bb.outer.end
+
+bb.inner.then:
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 2
+  store i32 2, i32 addrspace(1)* %tmp3, align 4
+  br label %bb.outer.end
+
+bb.outer.else:
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 3
+  store i32 3, i32 addrspace(1)* %tmp4, align 4
+  %cc3 = icmp eq i32 %tmp, 2
+  br i1 %cc3, label %bb.inner.then2, label %bb.outer.end
+
+bb.inner.then2:
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 4
+  store i32 4, i32 addrspace(1)* %tmp5, align 4
+  br label %bb.outer.end
+
+bb.outer.end:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }