diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "AMDGPURegBankSelect.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
 #include "GCNIterativeScheduler.h"
 #include "GCNSchedStrategy.h"
 #include "GCNVOPDUtils.h"
@@ -655,6 +656,10 @@
           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
           return true;
         }
+        if (PassName == "amdgpu-unify-divergent-exit-nodes") {
+          PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
+          return true;
+        }
         return false;
       });
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
@@ -0,0 +1,31 @@
+//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+namespace llvm {
+class AMDGPUUnifyDivergentExitNodesPass
+    : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -19,6 +19,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPUUnifyDivergentExitNodes.h"
 #include "AMDGPU.h"
 #include "SIDefines.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -53,25 +54,33 @@
 
 namespace {
 
-class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+class AMDGPUUnifyDivergentExitNodesImpl {
 private:
   const TargetTransformInfo *TTI = nullptr;
 
 public:
-  static char ID; // Pass identification, replacement for typeid
-
-  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
-    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
-  }
+  AMDGPUUnifyDivergentExitNodesImpl() = delete;
+  AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI)
+      : TTI(TTI) {}
 
   // We can preserve non-critical-edgeness when we unify function exit nodes
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
   BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                   ArrayRef<BasicBlock *> ReturningBlocks,
                                   StringRef Name);
-  bool runOnFunction(Function &F) override;
+  bool run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT,
+           const UniformityInfo &UA);
 };
 
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+  static char ID;
+  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+    initializeAMDGPUUnifyDivergentExitNodesPass(
+        *PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+};
 } // end anonymous namespace
 
 char AMDGPUUnifyDivergentExitNodes::ID = 0;
@@ -79,14 +88,14 @@
 char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
 
 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
-                     "Unify divergent function exit nodes", false, false)
+                      "Unify divergent function exit nodes", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                     "Unify divergent function exit nodes", false, false)
 
-void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
   if (RequireAndPreserveDomTree)
     AU.addRequired<DominatorTreeWrapperPass>();
 
@@ -132,7 +141,7 @@
   return true;
 }
 
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
     Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
     StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
@@ -180,21 +189,14 @@
   return NewRetBlock;
 }
 
-bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
-  DominatorTree *DT = nullptr;
-  if (RequireAndPreserveDomTree)
-    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
-  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
+                                            const PostDominatorTree &PDT,
+                                            const UniformityInfo &UA) {
   if (PDT.root_size() == 0 ||
       (PDT.root_size() == 1 &&
        !isa<ReturnInst>(PDT.getRoot()->getTerminator())))
     return false;
 
-  UniformityInfo &UA =
-      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
-  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
@@ -327,3 +329,30 @@
   unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
   return true;
 }
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+  DominatorTree *DT = nullptr;
+  if (RequireAndPreserveDomTree)
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  const auto &PDT =
+      getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  const auto *TransformInfo =
+      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA);
+}
+
+PreservedAnalyses
+AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  DominatorTree *DT = nullptr;
+  if (RequireAndPreserveDomTree)
+    DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+  const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
+  const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F);
+  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,36 +1,48 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
 
 define void @nested_inf_loop(i1 %0, i1 %1) {
-; CHECK-LABEL: nested_inf_loop:
-; CHECK-NEXT: %bb.0: ; %BB
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_1: ; %BB1
-; CHECK: s_and_b64 s[10:11], exec, s[6:7]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: %bb.2: ; %BB2
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_3: ; %BB4
-; CHECK: s_and_b64 s[10:11], exec, s[4:5]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_3
-; CHECK-NEXT: %bb.4: ; %loop.exit.guard
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 vcc, 0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_branch .LBB0_1
-; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @nested_inf_loop(
+; OPT-NEXT: BB:
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: BB1:
+; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
+; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
+; OPT: infloop:
+; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
+; OPT: DummyReturnBlock:
+; OPT-NEXT: ret void
+;
+; ISA-LABEL: nested_inf_loop:
+; ISA-NEXT: %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: v_and_b32_e32 v1, 1, v1
+; ISA-NEXT: v_and_b32_e32 v0, 1, v0
+; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_1: ; %BB1
+; ISA: s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_1
+; ISA-NEXT: %bb.2: ; %BB2
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_3: ; %BB4
+; ISA: s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_3
+; ISA-NEXT: %bb.4: ; %loop.exit.guard
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 vcc, 0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: s_branch .LBB0_1
+; ISA-NEXT: %bb.5: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
 BB:
   br label %BB1