diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -323,9 +323,14 @@
 void initializeGCNCreateVOPDPass(PassRegistry &);
 extern char &GCNCreateVOPDID;
 
-void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+void initializeAMDGPUUnifyDivergentExitNodesLegacyPass(PassRegistry &);
 extern char &AMDGPUUnifyDivergentExitNodesID;
 
+struct AMDGPUUnifyDivergentExitNodesPass
+    : PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 ImmutablePass *createAMDGPUAAWrapperPass();
 void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
 ImmutablePass *createAMDGPUExternalAAWrapperPass();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -400,7 +400,7 @@
   initializeSIFormMemoryClausesPass(*PR);
   initializeSIPostRABundlerPass(*PR);
   initializeGCNCreateVOPDPass(*PR);
-  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
+  initializeAMDGPUUnifyDivergentExitNodesLegacyPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
   initializeAMDGPUExternalAAWrapperPass(*PR);
   initializeAMDGPUUseNativeCallsPass(*PR);
@@ -647,6 +647,10 @@
           PM.addPass(AMDGPUPromoteKernelArgumentsPass());
           return true;
         }
+        if (PassName == "amdgpu-unify-divergent-exit-nodes") {
+          PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
+          return true;
+        }
         return false;
       });
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -39,6 +40,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -53,45 +55,44 @@
 
 namespace {
 
-class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+class AMDGPUUnifyDivergentExitNodesLegacy : public FunctionPass {
 private:
-  const TargetTransformInfo *TTI = nullptr;
+  const TargetTransformInfo *TTI;
 
 public:
   static char ID; // Pass identification, replacement for typeid
 
-  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
-    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+  AMDGPUUnifyDivergentExitNodesLegacy() : FunctionPass(ID) {
+    initializeAMDGPUUnifyDivergentExitNodesLegacyPass(
+        *PassRegistry::getPassRegistry());
   }
 
   // We can preserve non-critical-edgeness when we unify function exit nodes
   void getAnalysisUsage(AnalysisUsage &AU) const override;
-  BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
-                                  ArrayRef<BasicBlock *> ReturningBlocks,
-                                  StringRef Name);
   bool runOnFunction(Function &F) override;
 };
 
 } // end anonymous namespace
 
-char AMDGPUUnifyDivergentExitNodes::ID = 0;
+char AMDGPUUnifyDivergentExitNodesLegacy::ID = 0;
 
-char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+char &llvm::AMDGPUUnifyDivergentExitNodesID =
+    AMDGPUUnifyDivergentExitNodesLegacy::ID;
 
-INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
-                      "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodesLegacy, DEBUG_TYPE,
+                      "Unify divergent function exit nodes", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodesLegacy, DEBUG_TYPE,
                     "Unify divergent function exit nodes", false, false)
 
-void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+void AMDGPUUnifyDivergentExitNodesLegacy::getAnalysisUsage(
+    AnalysisUsage &AU) const {
   if (RequireAndPreserveDomTree)
     AU.addRequired<DominatorTreeWrapperPass>();
 
   AU.addRequired<PostDominatorTreeWrapperPass>();
 
-  AU.addRequired<LegacyDivergenceAnalysis>();
 
   if (RequireAndPreserveDomTree) {
@@ -114,14 +115,14 @@
 
 /// \returns true if \p BB is reachable through only uniform branches.
 /// XXX - Is there a more efficient way to find this?
-static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
-                               BasicBlock &BB) {
+static bool isUniformlyReached(BasicBlock &BB,
+                               std::function<bool(Value *)> IsUniform) {
   SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
   SmallPtrSet<BasicBlock *, 8> Visited;
 
   while (!Stack.empty()) {
     BasicBlock *Top = Stack.pop_back_val();
-    if (!DA.isUniform(Top->getTerminator()))
+    if (!IsUniform(Top->getTerminator()))
       return false;
 
     for (BasicBlock *Pred : predecessors(Top)) {
@@ -133,9 +134,10 @@
   return true;
 }
 
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
-    Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    StringRef Name) {
+BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
+                                ArrayRef<BasicBlock *> ReturningBlocks,
+                                StringRef Name,
+                                const TargetTransformInfo *TTI) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
@@ -181,20 +183,15 @@
   return NewRetBlock;
 }
 
-bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
-  DominatorTree *DT = nullptr;
-  if (RequireAndPreserveDomTree)
-    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
-  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  if (PDT.root_size() == 0 ||
-      (PDT.root_size() == 1 &&
-       !isa<ReturnInst>(PDT.getRoot()->getTerminator())))
+bool unifyDivergentExitNodesImpl(Function &F, DominatorTree *DT,
+                                 PostDominatorTree *PDT,
+                                 const TargetTransformInfo *TTI,
+                                 std::function<bool(Value *)> IsUniform) {
+  if (PDT->root_size() == 0 ||
+      (PDT->root_size() == 1 &&
+       !isa<ReturnInst>(PDT->getRoot()->getTerminator())))
     return false;
 
-  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
-  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
@@ -212,10 +209,11 @@
   // function exits. After structurizer is able to handle multiple function
   // exits, we should only unify UnreachableBlocks that are not uniformly
   // reachable.
-  bool HasDivergentExitBlock = llvm::any_of(
-      PDT.roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); });
+  bool HasDivergentExitBlock = llvm::any_of(PDT->roots(), [&](auto BB) {
+    return !isUniformlyReached(*BB, IsUniform);
+  });
 
-  for (BasicBlock *BB : PDT.roots()) {
+  for (BasicBlock *BB : PDT->roots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (HasDivergentExitBlock)
         ReturningBlocks.push_back(BB);
@@ -324,6 +322,36 @@
   if (ReturningBlocks.size() == 1)
     return Changed; // Already has a single return block
 
-  unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock", TTI);
   return true;
 }
+
+bool AMDGPUUnifyDivergentExitNodesLegacy::runOnFunction(Function &F) {
+  DominatorTree *DT = nullptr;
+  if (RequireAndPreserveDomTree)
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  auto IsUniform = [&](Value *V) { return DA.isUniform(V); };
+  return unifyDivergentExitNodesImpl(F, DT, &PDT, TTI, IsUniform);
+}
+
+PreservedAnalyses
+AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  DominatorTree *DT = nullptr;
+  if (RequireAndPreserveDomTree)
+    DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+  auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
+  auto *DA = &AM.getResult<DivergenceAnalysis>(F);
+  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+  auto IsUniform = [&](Value *V) { return DA->isUniform(*V); };
+
+  return unifyDivergentExitNodesImpl(F, DT, PDT, TTI, IsUniform)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,36 +1,48 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
 
 define void @nested_inf_loop(i1 %0, i1 %1) {
-; CHECK-LABEL: nested_inf_loop:
-; CHECK-NEXT: %bb.0: ; %BB
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_1: ; %BB1
-; CHECK: s_and_b64 s[10:11], exec, s[6:7]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: %bb.2: ; %BB2
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_3: ; %BB4
-; CHECK: s_and_b64 s[10:11], exec, s[4:5]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_3
-; CHECK-NEXT: %bb.4: ; %loop.exit.guard
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 vcc, 0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_branch .LBB0_1
-; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @nested_inf_loop(
+; OPT-NEXT: BB:
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: BB1:
+; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
+; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
+; OPT: infloop:
+; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
+; OPT: DummyReturnBlock:
+; OPT-NEXT: ret void
+;
+; ISA-LABEL: nested_inf_loop:
+; ISA-NEXT: %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: v_and_b32_e32 v1, 1, v1
+; ISA-NEXT: v_and_b32_e32 v0, 1, v0
+; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_1: ; %BB1
+; ISA: s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_1
+; ISA-NEXT: %bb.2: ; %BB2
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_3: ; %BB4
+; ISA: s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_3
+; ISA-NEXT: %bb.4: ; %loop.exit.guard
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 vcc, 0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: s_branch .LBB0_1
+; ISA-NEXT: %bb.5: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
 BB:
   br label %BB1