diff --git a/llvm/include/llvm/Analysis/LegacyDivergenceAnalysis.h b/llvm/include/llvm/Analysis/LegacyDivergenceAnalysis.h
--- a/llvm/include/llvm/Analysis/LegacyDivergenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/LegacyDivergenceAnalysis.h
@@ -16,6 +16,9 @@
 #define LLVM_ANALYSIS_LEGACYDIVERGENCEANALYSIS_H
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include <memory>
@@ -28,19 +31,8 @@
 class Use;
 class Value;
 
-class LegacyDivergenceAnalysis : public FunctionPass {
+class LegacyDivergenceAnalysisImpl {
 public:
-  static char ID;
-
-  LegacyDivergenceAnalysis();
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-  bool runOnFunction(Function &F) override;
-
-  // Print all divergent branches in the function.
-  void print(raw_ostream &OS, const Module *) const override;
-
   // Returns true if V is divergent at its definition.
   bool isDivergent(const Value *V) const;
@@ -57,11 +49,45 @@
   // Keep the analysis results uptodate by removing an erased value.
   void removeValue(const Value *V) { DivergentValues.erase(V); }
 
-private:
+  // Print all divergent branches in the function.
+  void print(raw_ostream &OS, const Module *) const;
+
   // Whether analysis should be performed by GPUDivergenceAnalysis.
   bool shouldUseGPUDivergenceAnalysis(const Function &F,
-                                      const TargetTransformInfo &TTI) const;
+                                      const TargetTransformInfo &TTI,
+                                      const LoopInfo &LI);
+
+  void run(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
+           PostDominatorTree &PDT, const LoopInfo &LI);
+
+protected:
+  // (optional) handle to new DivergenceAnalysis
+  std::unique_ptr<DivergenceInfo> gpuDA;
+
+  // Stores all divergent values.
+  DenseSet<const Value *> DivergentValues;
+
+  // Stores divergent uses of possibly uniform values.
+  DenseSet<const Use *> DivergentUses;
+};
+
+class LegacyDivergenceAnalysis : public FunctionPass,
+                                 public LegacyDivergenceAnalysisImpl {
+public:
+  static char ID;
+  LegacyDivergenceAnalysis();
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+};
+
+class LegacyDivergenceAnalysisPass
+    : public PassInfoMixin<LegacyDivergenceAnalysisPass>,
+      public LegacyDivergenceAnalysisImpl {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
   // (optional) handle to new DivergenceAnalysis
   std::unique_ptr<DivergenceInfo> gpuDA;
@@ -71,6 +97,7 @@
 
   // Stores divergent uses of possibly uniform values.
   DenseSet<const Use *> DivergentUses;
 };
-} // End llvm namespace
+
+} // end namespace llvm
 
 #endif // LLVM_ANALYSIS_LEGACYDIVERGENCEANALYSIS_H
diff --git a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
--- a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -299,47 +299,25 @@
   return new LegacyDivergenceAnalysis();
 }
 
-void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
-  AU.addRequiredTransitive<PostDominatorTreeWrapperPass>();
-  AU.addRequiredTransitive<LoopInfoWrapperPass>();
-  AU.setPreservesAll();
-}
-
-bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis(
-    const Function &F, const TargetTransformInfo &TTI) const {
+bool LegacyDivergenceAnalysisImpl::shouldUseGPUDivergenceAnalysis(
+    const Function &F, const TargetTransformInfo &TTI, const LoopInfo &LI) {
   if (!(UseGPUDA || TTI.useGPUDivergenceAnalysis()))
     return false;
 
   // GPUDivergenceAnalysis requires a reducible CFG.
-  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   using RPOTraversal = ReversePostOrderTraversal<const Function *>;
   RPOTraversal FuncRPOT(&F);
   return !containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
                                  const LoopInfo>(FuncRPOT, LI);
 }
 
-bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
-  auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
-  if (TTIWP == nullptr)
-    return false;
-
-  TargetTransformInfo &TTI = TTIWP->getTTI(F);
-  // Fast path: if the target does not have branch divergence, we do not mark
-  // any branch as divergent.
-  if (!TTI.hasBranchDivergence())
-    return false;
-
-  DivergentValues.clear();
-  DivergentUses.clear();
-  gpuDA = nullptr;
-
-  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-
-  if (shouldUseGPUDivergenceAnalysis(F, TTI)) {
+void LegacyDivergenceAnalysisImpl::run(Function &F,
+                                       llvm::TargetTransformInfo &TTI,
+                                       llvm::DominatorTree &DT,
+                                       llvm::PostDominatorTree &PDT,
+                                       const llvm::LoopInfo &LI) {
+  if (shouldUseGPUDivergenceAnalysis(F, TTI, LI)) {
     // run the new GPU divergence analysis
-    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     gpuDA = std::make_unique<DivergenceInfo>(F, DT, PDT, LI, TTI,
                                              /* KnownReducible = */ true);
@@ -349,29 +327,24 @@
     DP.populateWithSourcesOfDivergence();
     DP.propagate();
   }
-
-  LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
-                    << ":\n";
-             print(dbgs(), F.getParent()));
-
-  return false;
 }
 
-bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const {
+bool LegacyDivergenceAnalysisImpl::isDivergent(const Value *V) const {
   if (gpuDA) {
     return gpuDA->isDivergent(*V);
   }
   return DivergentValues.count(V);
 }
 
-bool LegacyDivergenceAnalysis::isDivergentUse(const Use *U) const {
+bool LegacyDivergenceAnalysisImpl::isDivergentUse(const Use *U) const {
   if (gpuDA) {
     return gpuDA->isDivergentUse(*U);
   }
   return DivergentValues.count(U->get()) || DivergentUses.count(U);
 }
 
-void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+void LegacyDivergenceAnalysisImpl::print(raw_ostream &OS,
+                                         const Module *) const {
   if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
@@ -407,3 +380,56 @@
   }
   OS << "\n";
 }
+
+void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+  AU.addRequiredTransitive<PostDominatorTreeWrapperPass>();
+  AU.addRequiredTransitive<LoopInfoWrapperPass>();
+  AU.setPreservesAll();
+}
+
+bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
+  auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  if (TTIWP == nullptr)
+    return false;
+
+  TargetTransformInfo &TTI = TTIWP->getTTI(F);
+  // Fast path: if the target does not have branch divergence, we do not mark
+  // any branch as divergent.
+  if (!TTI.hasBranchDivergence())
+    return false;
+
+  DivergentValues.clear();
+  DivergentUses.clear();
+  gpuDA = nullptr;
+
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  LegacyDivergenceAnalysisImpl::run(F, TTI, DT, PDT, LI);
+  LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
+                    << ":\n";
+             LegacyDivergenceAnalysisImpl::print(dbgs(), F.getParent()));
+
+  return false;
+}
+
+PreservedAnalyses
+LegacyDivergenceAnalysisPass::run(Function &F, FunctionAnalysisManager &AM) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!TTI.hasBranchDivergence())
+    return PreservedAnalyses::all();
+
+  DivergentValues.clear();
+  DivergentUses.clear();
+  gpuDA = nullptr;
+
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  LegacyDivergenceAnalysisImpl::run(F, TTI, DT, PDT, LI);
+  LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
+                    << ":\n";
+             LegacyDivergenceAnalysisImpl::print(dbgs(), F.getParent()));
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -46,6 +46,7 @@
 #include "llvm/Analysis/InstCount.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Lint.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -308,6 +308,7 @@
 FUNCTION_PASS("lint", LintPass())
 FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings())
 FUNCTION_PASS("instnamer", InstructionNamerPass())
+FUNCTION_PASS("legacy-divergence-analysis", LegacyDivergenceAnalysisPass())
 FUNCTION_PASS("loweratomic", LowerAtomicPass())
 FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
 FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -1,8 +1,19 @@
+; RUN: opt -mtriple amdgcn-amdhsa -mcpu=gfx90a -passes=legacy-divergence-analysis < %s -S 2>&1 | FileCheck -check-prefix=OPT %s
 ; RUN: llc -mtriple amdgcn-amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.readfirstlane(i32)
 
+; OPT-LABEL: define amdgpu_kernel void @readfirstlane_uniform(
+; OPT-NEXT:    %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+; OPT-NEXT:    %scalar = tail call i32 @llvm.amdgcn.readfirstlane(i32 %tid)
+; OPT-NEXT:    %idx = zext i32 %scalar to i64
+; OPT-NEXT:    %gep0 = getelementptr inbounds float, ptr addrspace(1) %0, i64 %idx
+; OPT-NEXT:    %val = load float, ptr addrspace(1) %gep0, align 4
+; OPT-NEXT:    %gep1 = getelementptr inbounds float, ptr addrspace(1) %1, i64 10
+; OPT-NEXT:    store float %val, ptr addrspace(1) %gep1, align 4
+; OPT-NEXT:    ret void
+;
 ; GCN-LABEL: readfirstlane_uniform
 ; GCN: s_load_dwordx4 s[[[IN_ADDR:[0-9]+]]:3], s[4:5], 0x0
 ; GCN: v_readfirstlane_b32 s[[SCALAR:[0-9]+]], v0