diff --git a/llvm/include/llvm/Analysis/DivergenceAnalysis.h b/llvm/include/llvm/Analysis/DivergenceAnalysis.h
--- a/llvm/include/llvm/Analysis/DivergenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DivergenceAnalysis.h
@@ -174,7 +174,7 @@
   }
 
   /// Whether \p V is uniform/non-divergent.
-  bool isUniform(const Value &V) const { return !isDivergent(V); }
+  bool isUniform(const Value *V) const { return !isDivergent(*V); }
 
   /// Whether \p U is uniform/non-divergent. Uses of a uniform value can be
   /// divergent.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -323,7 +323,7 @@
 void initializeGCNCreateVOPDPass(PassRegistry &);
 extern char &GCNCreateVOPDID;
 
-void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &);
 extern char &AMDGPUUnifyDivergentExitNodesID;
 
 ImmutablePass *createAMDGPUAAWrapperPass();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
 #include "GCNIterativeScheduler.h"
 #include "GCNSchedStrategy.h"
 #include "GCNVOPDUtils.h"
@@ -647,6 +648,10 @@
           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
           return true;
         }
+        if (PassName == "amdgpu-unify-divergent-exit-nodes") {
+          PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
+          return true;
+        }
         return false;
       });
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
new file
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
@@ -0,0 +1,71 @@
+//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there
+// is at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
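+//
+// For example, two divergent blocks that each 'ret' are redirected to a
+// single "UnifiedReturnBlock" (with a PHI if the function returns a value),
+// and a divergent infinite loop gets a conditional branch to a new
+// "DummyReturnBlock" so the region has a single, structurizable exit.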
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+class AMDGPUUnifyDivergentExitNodesImpl {
+protected:
+  const TargetTransformInfo *TTI = nullptr;
+
+public:
+  // We can preserve non-critical-edgeness when we unify function exit nodes
+  BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
+                                  ArrayRef<BasicBlock *> ReturningBlocks,
+                                  StringRef Name);
+  template <typename T>
+  bool run(Function &F, DominatorTree *DT, PostDominatorTree *PDT, T *DA);
+};
+
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass,
+                                      public AMDGPUUnifyDivergentExitNodesImpl {
+public:
+  static char ID;
+  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+    initializeAMDGPUUnifyDivergentExitNodesPass(
+        *PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+};
+
+class AMDGPUUnifyDivergentExitNodesPass
+    : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass>,
+      public AMDGPUUnifyDivergentExitNodesImpl {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -20,11 +20,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
 #include "SIDefines.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -39,6 +41,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -51,42 +54,19 @@
 
 #define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
 
-namespace {
-
-class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
-private:
-  const TargetTransformInfo *TTI = nullptr;
-
-public:
-  static char ID; // Pass identification, replacement for typeid
-
-  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
-    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
-  }
-
-  // We can preserve non-critical-edgeness when we unify function exit nodes
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
-                                  ArrayRef<BasicBlock *> ReturningBlocks,
-                                  StringRef Name);
-  bool runOnFunction(Function &F) override;
-};
-
-} // end anonymous namespace
-
 char AMDGPUUnifyDivergentExitNodes::ID = 0;
 
 char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
 
 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
-                     "Unify divergent function exit nodes", false, false)
+                      "Unify divergent function exit nodes", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false) -void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ +void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const { if (RequireAndPreserveDomTree) AU.addRequired(); @@ -114,14 +94,13 @@ /// \returns true if \p BB is reachable through only uniform branches. /// XXX - Is there a more efficient way to find this? -static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, - BasicBlock &BB) { +template static bool isUniformlyReached(T *DA, BasicBlock &BB) { SmallVector Stack(predecessors(&BB)); SmallPtrSet Visited; while (!Stack.empty()) { BasicBlock *Top = Stack.pop_back_val(); - if (!DA.isUniform(Top->getTerminator())) + if (!DA->isUniform(Top->getTerminator())) return false; for (BasicBlock *Pred : predecessors(Top)) { @@ -133,7 +112,7 @@ return true; } -BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet( +BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( Function &F, DomTreeUpdater &DTU, ArrayRef ReturningBlocks, StringRef Name) { // Otherwise, we need to insert a new basic block into the function, add a PHI @@ -181,20 +160,9 @@ return NewRetBlock; } -bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { - DominatorTree *DT = nullptr; - if (RequireAndPreserveDomTree) - DT = &getAnalysis().getDomTree(); - - auto &PDT = getAnalysis().getPostDomTree(); - if (PDT.root_size() == 0 || - (PDT.root_size() == 1 && - !isa(PDT.getRoot()->getTerminator()))) - return false; - - LegacyDivergenceAnalysis &DA = getAnalysis(); - TTI = &getAnalysis().getTTI(F); - +template +bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, + PostDominatorTree *PDT, T *DA) { // Loop over all of the blocks in a function, tracking all of the blocks that // return. SmallVector ReturningBlocks; @@ -213,9 +181,9 @@ // exits, we should only unify UnreachableBlocks that are not uniformly // reachable. bool HasDivergentExitBlock = llvm::any_of( - PDT.roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); }); + PDT->roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); }); - for (BasicBlock *BB : PDT.roots()) { + for (BasicBlock *BB : PDT->roots()) { if (isa(BB->getTerminator())) { if (HasDivergentExitBlock) ReturningBlocks.push_back(BB); @@ -327,3 +295,29 @@ unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock"); return true; } + +bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { + DominatorTree *DT = nullptr; + if (RequireAndPreserveDomTree) + DT = &getAnalysis().getDomTree(); + + auto &PDT = getAnalysis().getPostDomTree(); + LegacyDivergenceAnalysis &DA = getAnalysis(); + TTI = &getAnalysis().getTTI(F); + return AMDGPUUnifyDivergentExitNodes::run(F, DT, &PDT, &DA); +} + +PreservedAnalyses +AMDGPUUnifyDivergentExitNodesPass::run(Function &F, + FunctionAnalysisManager &AM) { + DominatorTree *DT = nullptr; + if (RequireAndPreserveDomTree) + DT = &AM.getResult(F); + + auto *PDT = &AM.getResult(F); + auto *DA = &AM.getResult(F); + TTI = &AM.getResult(F); + return AMDGPUUnifyDivergentExitNodesImpl::run(F, DT, PDT, DA) + ? 
+             : PreservedAnalyses::all();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,36 +1,48 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
 
 define void @nested_inf_loop(i1 %0, i1 %1) {
-; CHECK-LABEL: nested_inf_loop:
-; CHECK-NEXT: %bb.0: ; %BB
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_1: ; %BB1
-; CHECK: s_and_b64 s[10:11], exec, s[6:7]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: %bb.2: ; %BB2
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_3: ; %BB4
-; CHECK: s_and_b64 s[10:11], exec, s[4:5]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_3
-; CHECK-NEXT: %bb.4: ; %loop.exit.guard
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 vcc, 0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_branch .LBB0_1
-; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @nested_inf_loop(
+; OPT-NEXT: BB:
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: BB1:
+; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
+; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
+; OPT: infloop:
+; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
+; OPT: DummyReturnBlock:
+; OPT-NEXT: ret void
+;
+; ISA-LABEL: nested_inf_loop:
+; ISA-NEXT: %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: v_and_b32_e32 v1, 1, v1
+; ISA-NEXT: v_and_b32_e32 v0, 1, v0
+; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_1: ; %BB1
+; ISA: s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_1
+; ISA-NEXT: %bb.2: ; %BB2
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_3: ; %BB4
+; ISA: s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_3
+; ISA-NEXT: %bb.4: ; %loop.exit.guard
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 vcc, 0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: s_branch .LBB0_1
+; ISA-NEXT: %bb.5: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
 
 BB:
   br label %BB1