Index: include/llvm/ADT/PostOrderIterator.h =================================================================== --- include/llvm/ADT/PostOrderIterator.h +++ include/llvm/ADT/PostOrderIterator.h @@ -296,12 +296,15 @@ public: using rpo_iterator = typename std::vector::reverse_iterator; + using const_rpo_iterator = typename std::vector::const_reverse_iterator; ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); } // Because we want a reverse post order, use reverse iterators from the vector rpo_iterator begin() { return Blocks.rbegin(); } + const_rpo_iterator begin() const { return Blocks.crbegin(); } rpo_iterator end() { return Blocks.rend(); } + const_rpo_iterator end() const { return Blocks.crend(); } }; } // end namespace llvm Index: include/llvm/Analysis/DivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/DivergenceAnalysis.h +++ include/llvm/Analysis/DivergenceAnalysis.h @@ -7,55 +7,215 @@ // //===----------------------------------------------------------------------===// // -// The divergence analysis is an LLVM pass which can be used to find out -// if a branch instruction in a GPU program is divergent or not. It can help -// branch optimizations such as jump threading and loop unswitching to make -// better decisions. +// The divergence analysis determines which instructions and branches are +// divergent given a set of divergent source instructions. // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H -#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H + +#ifndef LLVM_ANALYSIS_DIVERGENCEANALYSIS_H +#define LLVM_ANALYSIS_DIVERGENCEANALYSIS_H #include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/SyncDependenceAnalysis.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include namespace llvm { +class Module; class Value; -class DivergenceAnalysis : public FunctionPass { +class Instruction; +class Loop; +class raw_ostream; +class TargetTransformInfo; + +// generic divergence analysis +class DivergenceAnalysis { +public: + // this instance will analyze the whole function @F unless @regionLoop is + // given. In that case the analysis is restricted to @regionLoop. + // The analyzed part of the CFG is refered to as the region. + DivergenceAnalysis(const Function &F, const Loop *regionLoop, + const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, bool IsLCSSAForm); + + // returns the loop that defines the analyzed region (if any) + const Loop *getRegionLoop() const { return regionLoop; } + const Function &getFunction() const { return F; } + + // whether @I is part of the region + bool inRegion(const BasicBlock &BB) const; + bool inRegion(const Instruction &I) const; + + // mark @uniVal as a value that is always uniform + void addUniformOverride(const Value &uniVal); + + // mark @divVal as a value that is always divergent + void markDivergent(const Value &divVal); + + // propagate divergence to all instructions in the region. + // Divergence is seeded by calls to @markDivergent. + void compute(); + + // whether any value was marked or analyzed to be divergent. + bool hasDetectedDivergence() const { return !divergentValues.empty(); } + + // whether @val will always return a uniform values regardless of its operands + bool isAlwaysUniform(const Value &val) const; + + // whether @val is a divergent value + bool isDivergent(const Value &val) const; + + void print(raw_ostream &OS, const Module *) const; + +private: + bool updateTerminator(const TerminatorInst &term) const; + bool updatePHINode(const PHINode &phi) const; + + // returns whether @term should be considered divergent based on the + // divergence of its operands. This should only be called for + // non-phi, non-terminator instructions. + bool updateNormalInstruction(const Instruction &term) const; + + // marks all users of live-out values of the loop headed by @loopHeader + // as divergent and puts them on the worklist. + void taintLoopLiveOuts(const BasicBlock &loopHeader); + + // push all users of @val (in the region) to the worklist + void pushUsers(const Value &I); + + // push all phi nodes in @block to the worklist + void pushPHINodes(const BasicBlock &block); + + // mark @block as join divergent + // A block is join divergent if two threads may reach it from different + // incoming blocks at the same time. + void markBlockJoinDivergent(const BasicBlock &block) { + divergentJoinBlocks.insert(&block); + } + + // @whether @val appears as a temporal divergent value when observed in + // @observingBlock + bool isTemporalDivergent(const BasicBlock &observingBlock, + const Value &val) const; + + // @whether @block is join divergent (see markBlockJoinDivergent) + bool isJoinDivergent(const BasicBlock &block) const { + return divergentJoinBlocks.count(&block); + } + + // propagate control-induced divergence to users (phi nodes and instructions) + // @joinBlock is a divergent loop exit or join point of two disjoint paths. + // returns whether @joinBlock is a divergent loop exit of @termLoop. + bool propagateJoinDivergence(const BasicBlock &joinBlock, + const Loop *termLoop); + + // propagate induced value divergence due to control divergence in @term + void propagateBranchDivergence(const TerminatorInst &term); + + // propagate induced value divergence due to loop exit divergence from + // @exitingLoop + void propagateLoopDivergence(const Loop &exitingLoop); + +private: + const Function &F; + // if regionLoop != nullptr, analyze only in the scope of the loop + // Otw, analyze the whole function + const Loop *regionLoop; + + const DominatorTree &DT; + const LoopInfo &LI; + + // recognized divergent loops + DenseSet divergentLoops; + + // The SDA links divergent branches to divergent control-flow joins + SyncDependenceAnalysis &SDA; + + // use simplified code path for LCSSA form + bool IsLCSSAForm; + + // set of known-uniform values + DenseSet uniformOverrides; + + // blocks with joining divergent control from different predecessors + DenseSet divergentJoinBlocks; + + // detected/marked divergent values + DenseSet divergentValues; + + // internal worklist for divergence propagation + std::vector worklist; +}; + +// divergence analysis frontend for loops +class LoopDivergenceAnalysis { +public: + LoopDivergenceAnalysis(const DominatorTree &DT, const LoopInfo &LI, + SyncDependenceAnalysis &SDA, const Loop &loop); + + // whether @V is divergent + bool isDivergent(const Value &val) const; + + // whether @V is uniform/non-divergent + bool isUniform(const Value &val) const { return !isDivergent(val); } + + // print all divergent values in the loop. + void print(raw_ostream &OS, const Module *) const; + +private: + DivergenceAnalysis DA; +}; + +// loop divergence printer pass - for standalone testing +class LoopDivergencePrinter : public FunctionPass { public: static char ID; - DivergenceAnalysis() : FunctionPass(ID) { - initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); + LoopDivergencePrinter() : FunctionPass(ID) { + initializeLoopDivergencePrinterPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; + // analyze all loop-divergence of all loops in @F and print the results bool runOnFunction(Function &F) override; - // Print all divergent branches in the function. + // print all divergent values in the function void print(raw_ostream &OS, const Module *) const override; - // Returns true if V is divergent at its definition. - // - // Even if this function returns false, V may still be divergent when used - // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } +private: + std::unique_ptr SDA; + SmallVector, 6> loopDivInfo; +}; - // Returns true if V is uniform/non-divergent. - // - // Even if this function returns true, V may still be divergent when used - // in a different basic block. - bool isUniform(const Value *V) const { return !isDivergent(V); } +// divergence analysis frontend for GPU kernels +class GPUDivergenceAnalysis { + SyncDependenceAnalysis SDA; + DivergenceAnalysis DA; - // Keep the analysis results uptodate by removing an erased value. - void removeValue(const Value *V) { DivergentValues.erase(V); } +public: + // runs the divergence analysis on @F, a GPU kernel + GPUDivergenceAnalysis(Function &F, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI, + const TargetTransformInfo &TTI); -private: - // Stores all divergent values. - DenseSet DivergentValues; + // whether any divergence was detected in @F + bool hasDivergence() const { return DA.hasDetectedDivergence(); } + + // the GPU kernel this analysis result is for + const Function &getFunction() const { return DA.getFunction(); } + + // whether @V is divergent + bool isDivergent(const Value &val) const; + + // whether @V is uniform/non-divergent + bool isUniform(const Value &val) const { return !isDivergent(val); } + + // print all divergent values in the kernel F + void print(raw_ostream &OS, const Module *) const; }; -} // End llvm namespace -#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H \ No newline at end of file +} // namespace llvm + +#endif // LLVM_ANALYSIS_DIVERGENCEANALYSIS_H Index: include/llvm/Analysis/KernelDivergenceAnalysis.h =================================================================== --- include/llvm/Analysis/KernelDivergenceAnalysis.h +++ include/llvm/Analysis/KernelDivergenceAnalysis.h @@ -1,4 +1,4 @@ -//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===// +//===- llvm/Analysis/KernelDivergenceAnalysis.h - KernelDivergence Analysis -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// The divergence analysis is an LLVM pass which can be used to find out -// if a branch instruction in a GPU program is divergent or not. It can help +// The kernel divergence analysis is an LLVM pass which can be used to find out +// if a branch instruction in a GPU program (kernel) is divergent or not. It can help // branch optimizations such as jump threading and loop unswitching to make // better decisions. // @@ -19,15 +19,17 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" +#include "llvm/Analysis/DivergenceAnalysis.h" namespace llvm { class Value; -class DivergenceAnalysis : public FunctionPass { +class GPUDivergenceAnalysis; +class KernelDivergenceAnalysis : public FunctionPass { public: static char ID; - DivergenceAnalysis() : FunctionPass(ID) { - initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); + KernelDivergenceAnalysis() : FunctionPass(ID) { + initializeKernelDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -41,7 +43,7 @@ // // Even if this function returns false, V may still be divergent when used // in a different basic block. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } + bool isDivergent(const Value *V) const; // Returns true if V is uniform/non-divergent. // @@ -53,9 +55,12 @@ void removeValue(const Value *V) { DivergentValues.erase(V); } private: + // (optional) handle to new DivergenceAnalysis + std::unique_ptr gpuDA; + // Stores all divergent values. DenseSet DivergentValues; }; } // End llvm namespace -#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H \ No newline at end of file +#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -61,10 +61,17 @@ //===--------------------------------------------------------------------===// // - // createDivergenceAnalysisPass - This pass determines which branches in a GPU + // createKernelDivergenceAnalysisPass - This pass determines which branches in a GPU // program are divergent. // - FunctionPass *createDivergenceAnalysisPass(); + FunctionPass *createKernelDivergenceAnalysisPass(); + + //===--------------------------------------------------------------------===// + // + // createLoopDivergencePrinterPass - This pass determines which branches and + // instructions in a loop are divergent. + // + FunctionPass *createLoopDivergencePrinterPass(); //===--------------------------------------------------------------------===// // Index: include/llvm/Analysis/SyncDependenceAnalysis.h =================================================================== --- /dev/null +++ include/llvm/Analysis/SyncDependenceAnalysis.h @@ -0,0 +1,75 @@ +//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the SyncDependenceAnalysis class, which computes for +// every divergent branch the set of phi nodes that the branch will make +// divergent. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_BRANCHDEPENDENCEANALYSIS_H +#define LLVM_ANALYSIS_BRANCHDEPENDENCEANALYSIS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/LoopInfo.h" + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class Loop; +class PostDominatorTree; +class TerminatorInst; +class TerminatorInst; + +using ConstBlockSet = SmallPtrSet; + +// TODO add LLVM pass manager wrappings +class SyncDependenceAnalysis { + void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop, + const BasicBlock *defBlock); + +public: + bool inRegion(const BasicBlock &BB) const; + + ~SyncDependenceAnalysis(); + SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT, + const LoopInfo &LI); + + // the set of blocks which are reachable by disjoint paths from @term. + // The set also contains loop exits if there two disjoin paths: + // one from @term to the loop exit and another from @term to the loop header. + // Those exit blocks are added to the returned set. + // If L is the parent loop of @term and an exit of L is in the returned set + // then L is a divergent loop. + const ConstBlockSet &join_blocks(const TerminatorInst &term); + + // the set of blocks which are reachable by disjoin paths from the + // loop exits of @loop. + // This treats the loop as a single node in @loop's parent loop. + // The returned set has the same properties as for join_blocks(TermInst&). + const ConstBlockSet &join_blocks(const Loop &loop); + +private: + static ConstBlockSet emptyBlockSet; + + ReversePostOrderTraversal funcRPOT; + const DominatorTree &DT; + const PostDominatorTree &PDT; + const LoopInfo &LI; + + std::map cachedLoopExitJoins; + std::map cachedBranchJoins; +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_BRANCHDEPENDENCEANALYSIS_H Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -289,7 +289,7 @@ /// Returns whether V is a source of divergence. /// /// This function provides the target-dependent information for - /// the target-independent DivergenceAnalysis. DivergenceAnalysis first + /// the target-independent KernelDivergenceAnalysis. KernelDivergenceAnalysis first /// builds the dependency graph, and then runs the reachability algorithm /// starting with the sources of divergence. bool isSourceOfDivergence(const Value *V) const; Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -28,7 +28,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -229,7 +229,7 @@ LLVMContext *Context; CodeGenOpt::Level OptLevel; - DivergenceAnalysis * DA = nullptr; + KernelDivergenceAnalysis * DA = nullptr; FunctionLoweringInfo * FLI = nullptr; /// The function-level optimization remark emitter. Used to emit remarks @@ -382,7 +382,7 @@ /// Prepare this SelectionDAG to process code in the given MachineFunction. void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - DivergenceAnalysis * Divergence); + KernelDivergenceAnalysis * Divergence); void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) { FLI = FuncInfo; Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -29,7 +29,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -2648,7 +2648,7 @@ virtual bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, - DivergenceAnalysis *DA) const { + KernelDivergenceAnalysis *DA) const { return false; } Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -118,6 +118,8 @@ void initializeDependenceAnalysisPass(PassRegistry&); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); +void initializeKernelDivergenceAnalysisPass(PassRegistry&); +void initializeLoopDivergencePrinterPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); void initializeDivergenceAnalysisPass(PassRegistry&); void initializeDomOnlyPrinterPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -94,7 +94,8 @@ (void) llvm::createDeadInstEliminationPass(); (void) llvm::createDeadStoreEliminationPass(); (void) llvm::createDependenceAnalysisWrapperPass(); - (void) llvm::createDivergenceAnalysisPass(); + (void) llvm::createKernelDivergenceAnalysisPass(); + (void) llvm::createLoopDivergencePrinterPass(); (void) llvm::createDomOnlyPrinterPass(); (void) llvm::createDomPrinterPass(); (void) llvm::createDomOnlyViewerPass(); Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -39,7 +39,8 @@ initializeDependenceAnalysisWrapperPassPass(Registry); initializeDelinearizationPass(Registry); initializeDemandedBitsWrapperPassPass(Registry); - initializeDivergenceAnalysisPass(Registry); + initializeKernelDivergenceAnalysisPass(Registry); + initializeLoopDivergencePrinterPass(Registry); initializeDominanceFrontierWrapperPassPass(Registry); initializeDomViewerPass(Registry); initializeDomPrinterPass(Registry); Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -38,6 +38,7 @@ Interval.cpp IntervalPartition.cpp IteratedDominanceFrontier.cpp + KernelDivergenceAnalysis.cpp LazyBranchProbabilityInfo.cpp LazyBlockFrequencyInfo.cpp LazyCallGraph.cpp @@ -76,6 +77,7 @@ ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionExpander.cpp ScalarEvolutionNormalization.cpp + SyncDependenceAnalysis.cpp SyntheticCountsUtils.cpp TargetLibraryInfo.cpp TargetTransformInfo.cpp Index: lib/Analysis/DivergenceAnalysis.cpp =================================================================== --- lib/Analysis/DivergenceAnalysis.cpp +++ lib/Analysis/DivergenceAnalysis.cpp @@ -7,9 +7,10 @@ // //===----------------------------------------------------------------------===// // -// This file implements divergence analysis which determines whether a branch -// in a GPU program is divergent.It can help branch optimizations such as jump -// threading and loop unswitching to make better decisions. +// This file implements a general divergence analysis for loop vectorization +// and GPU programs. It determines whether a branch in a loop or GPU program +// is divergent. It can help branch optimizations such as jump threading and +// loop unswitching to make better decisions. // // GPU programs typically use the SIMD execution model, where multiple threads // in the same execution group have to execute in lock-step. Therefore, if the @@ -25,16 +26,23 @@ // optimizations. // // This file defines divergence analysis which computes a conservative but -// non-trivial approximation of all divergent branches in a GPU program. It -// partially implements the approach described in +// non-trivial approximation of all divergent branches in a GPU program. This +// implementation is derived from the Vectorization Analysis of the Region +// Vectorizer (RV). That implementation in turn is based on the approach +// described in // -// Divergence Analysis -// Sampaio, Souza, Collange, Pereira -// TOPLAS '13 +// Improving Performance of OpenCL on CPUs +// Ralf Karrenberg and Sebastian Hack +// CC '12 // -// The divergence analysis identifies the sources of divergence (e.g., special -// variables that hold the thread ID), and recursively marks variables that are -// data or sync dependent on a source of divergence as divergent. +// The DivergenceAnalysis implementation is generic in the sense that it doe +// not itself identify original sources of divergence. +// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and +// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence +// (e.g., special variables that hold the thread ID or the iteration variable). +// +// The generic implementation propagates divergence to variables that are data +// or sync dependent on a source of divergence. // // While data dependency is a well-known concept, the notion of sync dependency // is worth more explanation. Sync dependence characterizes the control flow @@ -54,287 +62,476 @@ // because the branch "br i1 %cond" depends on %tid and affects which value %a // is assigned to. // -// The current implementation has the following limitations: +// The sync dependence detection (which branch induces divergence in which join +// points) is implemented in the SyncDependenceAnalysis. +// +// The current DivergenceAnalysis implementation has the following limitations: // 1. intra-procedural. It conservatively considers the arguments of a // non-kernel-entry function and the return value of a function call as // divergent. // 2. memory as black box. It conservatively considers values loaded from // generic or local address as divergent. This can be improved by leveraging -// pointer analysis. +// pointer analysis and/or by modelling non-escaping memory objects in SSA +// as done in RV. // //===----------------------------------------------------------------------===// #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include + using namespace llvm; -#define DEBUG_TYPE "divergence" - -namespace { - -class DivergencePropagator { -public: - DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - PostDominatorTree &PDT, DenseSet &DV) - : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {} - void populateWithSourcesOfDivergence(); - void propagate(); - -private: - // A helper function that explores data dependents of V. - void exploreDataDependency(Value *V); - // A helper function that explores sync dependents of TI. - void exploreSyncDependency(TerminatorInst *TI); - // Computes the influence region from Start to End. This region includes all - // basic blocks on any simple path from Start to End. - void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End, - DenseSet &InfluenceRegion); - // Finds all users of I that are outside the influence region, and add these - // users to Worklist. - void findUsersOutsideInfluenceRegion( - Instruction &I, const DenseSet &InfluenceRegion); - - Function &F; - TargetTransformInfo &TTI; - DominatorTree &DT; - PostDominatorTree &PDT; - std::vector Worklist; // Stack for DFS. - DenseSet &DV; // Stores all divergent values. -}; - -void DivergencePropagator::populateWithSourcesOfDivergence() { - Worklist.clear(); - DV.clear(); - for (auto &I : instructions(F)) { - if (TTI.isSourceOfDivergence(&I)) { - Worklist.push_back(&I); - DV.insert(&I); +#define DEBUG_TYPE "divergence-analysis" + +// class DivergenceAnalysis +DivergenceAnalysis::DivergenceAnalysis( + const Function &F, const Loop *regionLoop, const DominatorTree &DT, + const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm) + : F(F), regionLoop(regionLoop), DT(DT), LI(LI), SDA(SDA), + IsLCSSAForm(IsLCSSAForm) {} + +void DivergenceAnalysis::markDivergent(const Value &divVal) { + assert(isa(divVal) || isa(divVal)); + assert(!isAlwaysUniform(divVal) && "can not be a divergent"); + divergentValues.insert(&divVal); +} + +void DivergenceAnalysis::addUniformOverride(const Value &uniVal) { + uniformOverrides.insert(&uniVal); +} + +bool DivergenceAnalysis::updateTerminator(const TerminatorInst &term) const { + if (term.getNumSuccessors() <= 1) + return false; + if (auto *branchInst = dyn_cast(&term)) { + assert(branchInst->isConditional()); + return isDivergent(*branchInst->getCondition()); + } else if (auto *switchInst = dyn_cast(&term)) { + return isDivergent(*switchInst->getCondition()); + } else if (isa(term)) { + return false; // ignore abnormal executions through landingpad + } else { + abort(); + } +} + +bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const { + // TODO function calls with side effects, etc + for (const auto &op : I.operands()) { + if (isDivergent(*op)) + return true; + } + return false; +} + +bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &observingBlock, + const Value &val) const { + const auto *inst = dyn_cast(&val); + if (!inst) + return false; + const auto *observingLoop = LI.getLoopFor(&observingBlock); + for (const auto *loop = LI.getLoopFor(inst->getParent()); + loop != observingLoop; loop = loop->getParentLoop()) { + if (divergentLoops.count(loop)) + return true; + } + + return false; +} + +bool DivergenceAnalysis::updatePHINode(const PHINode &phi) const { + // joining divergent disjoint path in @phi parent block + if (!phi.hasConstantOrUndefValue() && isJoinDivergent(*phi.getParent())) { + return true; + } + + // An incoming value could be divergent by itself. + // Otherwise, an incoming value could be uniform within the loop + // that carries its definition but it may appear divergent + // from outside the loop. This happens when divergent loop exits + // drop definitions of that uniform value in different iterations. + // + // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop + // if (i*i % 7 == 0) break; // divergent loop exit + // } + // int divI = i; // divI is divergent + for (size_t i = 0; i < phi.getNumIncomingValues(); ++i) { + const auto *inVal = phi.getIncomingValue(i); + if (isDivergent(*phi.getIncomingValue(i)) || + isTemporalDivergent(*phi.getParent(), *inVal)) { + return true; } } - for (auto &Arg : F.args()) { - if (TTI.isSourceOfDivergence(&Arg)) { - Worklist.push_back(&Arg); - DV.insert(&Arg); + return false; +} + +bool DivergenceAnalysis::inRegion(const Instruction &I) const { + return I.getParent() && inRegion(*I.getParent()); +} + +bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const { + return (!regionLoop && BB.getParent() == &F) || regionLoop->contains(&BB); +} + +// marks all users of loop-carried values of the loop headed by @loopHeader as +// divergent +void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &loopHeader) { + auto *divLoop = LI.getLoopFor(&loopHeader); + assert(divLoop && "loopHeader is not actually part of a loop"); + + SmallVector taintStack; + divLoop->getExitBlocks(taintStack); + + // Otherwise potential users of loop-carried values could be anywhere in the + // dominance region of @divLoop (including its fringes for phi nodes) + DenseSet visited; + for (auto *block : taintStack) { + visited.insert(block); + } + visited.insert(&loopHeader); + + while (!taintStack.empty()) { + auto *userBlock = taintStack.back(); + taintStack.pop_back(); + + // don't spread divergence beyond the region + if (!inRegion(*userBlock)) + continue; + + assert(!divLoop->contains(userBlock) && + "irreducible control flow detected"); + + // phi nodes at the fringes of the dominance region + if (!DT.dominates(&loopHeader, userBlock)) { + // all PHI nodes of @userBlock become divergent + for (auto &blockInst : *userBlock) { + if (!isa(blockInst)) + break; + worklist.push_back(&blockInst); + } + continue; + } + + // taint outside users of values carried by divLoop + for (auto &I : *userBlock) { + if (isAlwaysUniform(I)) + continue; + if (isDivergent(I)) + continue; + + for (auto &Op : I.operands()) { + auto *opInst = dyn_cast(&Op); + if (!opInst) + continue; + if (divLoop->contains(opInst->getParent())) { + markDivergent(I); + pushUsers(I); + break; + } + } + } + + // visit all blocks in the dominance region + for (auto *succBlock : successors(userBlock)) { + if (!visited.insert(userBlock).second) + continue; + taintStack.push_back(succBlock); } } } -void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) { - // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's - // immediate post dominator are divergent. This rule handles if-then-else - // patterns. For example, - // - // if (tid < 5) - // a1 = 1; - // else - // a2 = 2; - // a = phi(a1, a2); // sync dependent on (tid < 5) - BasicBlock *ThisBB = TI->getParent(); - - // Unreachable blocks may not be in the dominator tree. - if (!DT.isReachableFromEntry(ThisBB)) - return; +void DivergenceAnalysis::pushPHINodes(const BasicBlock &block) { + for (const auto &inst : block) { + if (!isa(inst)) + continue; + if (isDivergent(inst)) + continue; + worklist.push_back(&inst); + } +} - // If the function has no exit blocks or doesn't reach any exit blocks, the - // post dominator may be null. - DomTreeNode *ThisNode = PDT.getNode(ThisBB); - if (!ThisNode) - return; +void DivergenceAnalysis::pushUsers(const Value &V) { + for (const auto *user : V.users()) { + const auto *userInst = dyn_cast(user); + if (!userInst) + continue; - BasicBlock *IPostDom = ThisNode->getIDom()->getBlock(); - if (IPostDom == nullptr) - return; + if (isDivergent(*userInst)) + continue; - for (auto I = IPostDom->begin(); isa(I); ++I) { - // A PHINode is uniform if it returns the same value no matter which path is - // taken. - if (!cast(I)->hasConstantOrUndefValue() && DV.insert(&*I).second) - Worklist.push_back(&*I); + // only compute divergent inside loop + if (!inRegion(*userInst)) + continue; + worklist.push_back(userInst); } +} - // Propagation rule 2: if a value defined in a loop is used outside, the user - // is sync dependent on the condition of the loop exits that dominate the - // user. For example, - // - // int i = 0; - // do { - // i++; - // if (foo(i)) ... // uniform - // } while (i < tid); - // if (bar(i)) ... // divergent - // - // A program may contain unstructured loops. Therefore, we cannot leverage - // LoopInfo, which only recognizes natural loops. - // - // The algorithm used here handles both natural and unstructured loops. Given - // a branch TI, we first compute its influence region, the union of all simple - // paths from TI to its immediate post dominator (IPostDom). Then, we search - // for all the values defined in the influence region but used outside. All - // these users are sync dependent on TI. - DenseSet InfluenceRegion; - computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion); - // An insight that can speed up the search process is that all the in-region - // values that are used outside must dominate TI. Therefore, instead of - // searching every basic blocks in the influence region, we search all the - // dominators of TI until it is outside the influence region. - BasicBlock *InfluencedBB = ThisBB; - while (InfluenceRegion.count(InfluencedBB)) { - for (auto &I : *InfluencedBB) - findUsersOutsideInfluenceRegion(I, InfluenceRegion); - DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom(); - if (IDomNode == nullptr) - break; - InfluencedBB = IDomNode->getBlock(); +bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &joinBlock, + const Loop *branchLoop) { + LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << joinBlock.getName() << "\n"); + + // ignore divergence outside the region + if (!inRegion(joinBlock)) + return false; + + // push non-divergent phi nodes in @joinBlock to the worklist + pushPHINodes(joinBlock); + + // @joinBlock is a divergent loop exit + if (branchLoop && !branchLoop->contains(&joinBlock)) { + return true; + + } else { + // disjoint-paths divergente at @joinBlock + markBlockJoinDivergent(joinBlock); + return false; } } -void DivergencePropagator::findUsersOutsideInfluenceRegion( - Instruction &I, const DenseSet &InfluenceRegion) { - for (User *U : I.users()) { - Instruction *UserInst = cast(U); - if (!InfluenceRegion.count(UserInst->getParent())) { - if (DV.insert(UserInst).second) - Worklist.push_back(UserInst); +void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &term) { + LLVM_DEBUG(dbgs() << "propBranchDiv " << term.getParent()->getName() << "\n"); + + markDivergent(term); + + const auto *branchLoop = LI.getLoopFor(term.getParent()); + + // whether there is a divergent loop exit from @branchLoop (if any) + bool isBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint from @term within the loop + // also iterates over loop exits that become divergent due to @term. + for (const auto *joinBlock : SDA.join_blocks(term)) { + isBranchLoopDivergent |= propagateJoinDivergence(*joinBlock, branchLoop); + } + + // @branch loop is a divergent loop due to the divergent branch in @term + if (isBranchLoopDivergent) { + assert(branchLoop); + if (!divergentLoops.insert(branchLoop).second) { + return; } + propagateLoopDivergence(*branchLoop); } } -// A helper function for computeInfluenceRegion that adds successors of "ThisBB" -// to the influence region. -static void -addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End, - DenseSet &InfluenceRegion, - std::vector &InfluenceStack) { - for (BasicBlock *Succ : successors(ThisBB)) { - if (Succ != End && InfluenceRegion.insert(Succ).second) - InfluenceStack.push_back(Succ); +void DivergenceAnalysis::propagateLoopDivergence(const Loop &exitingLoop) { + LLVM_DEBUG(dbgs() << "propLoopDiv " << exitingLoop.getName() << "\n"); + + // don't propagate beyond region + if (!inRegion(*exitingLoop.getHeader())) + return; + + const auto *branchLoop = exitingLoop.getParentLoop(); + + // Uses of loop-carried values could occur anywhere + // within the dominance region of the definition. All loop-carried + // definitions are dominated by the loop header (reducible control). + // Thus all users have to be in the dominance region of the loop header, + // except PHI nodes that can also live at the fringes of the dom region + // (incoming defining value). + if (!IsLCSSAForm) + taintLoopLiveOuts(*exitingLoop.getHeader()); + + // whether there is a divergent loop exit from @branchLoop (if any) + bool isBranchLoopDivergent = false; + + // iterate over all blocks reachable by disjoint paths from exits of + // @exitingLoop also iterates over loop exits (of @branchLoop) that in turn + // become divergent. + for (const auto *joinBlock : SDA.join_blocks(exitingLoop)) { + isBranchLoopDivergent |= propagateJoinDivergence(*joinBlock, branchLoop); + } + + // @branch loop is a divergent due to divergent loop exit in @exitingLoop + if (isBranchLoopDivergent) { + assert(branchLoop); + if (!divergentLoops.insert(branchLoop).second) { + return; + } + propagateLoopDivergence(*branchLoop); } } -void DivergencePropagator::computeInfluenceRegion( - BasicBlock *Start, BasicBlock *End, - DenseSet &InfluenceRegion) { - assert(PDT.properlyDominates(End, Start) && - "End does not properly dominate Start"); - - // The influence region starts from the end of "Start" to the beginning of - // "End". Therefore, "Start" should not be in the region unless "Start" is in - // a loop that doesn't contain "End". - std::vector InfluenceStack; - addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack); - while (!InfluenceStack.empty()) { - BasicBlock *BB = InfluenceStack.back(); - InfluenceStack.pop_back(); - addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack); +void DivergenceAnalysis::compute() { + for (auto *divVal : divergentValues) { + pushUsers(*divVal); + } + + // propagate divergence + while (!worklist.empty()) { + const Instruction &I = *worklist.back(); + worklist.pop_back(); + + // maintain uniformity of overrides + if (isAlwaysUniform(I)) + continue; + + bool wasDivergent = isDivergent(I); + if (wasDivergent) + continue; + + // propagate divergence caused by terminator + if (isa(I)) { + auto &term = cast(I); + if (updateTerminator(term)) { + // propagate control divergence to affected instructions + propagateBranchDivergence(term); + continue; + } + } + + // update divergence of I due to divergent operands + bool divergentUpd = false; + if (isa(I)) { + divergentUpd = updatePHINode(cast(I)); + } else { + divergentUpd = updateNormalInstruction(I); + } + + // propagate value divergence to users + if (divergentUpd) { + markDivergent(I); + pushUsers(I); + } } } -void DivergencePropagator::exploreDataDependency(Value *V) { - // Follow def-use chains of V. - for (User *U : V->users()) { - Instruction *UserInst = cast(U); - if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second) - Worklist.push_back(UserInst); +bool DivergenceAnalysis::isAlwaysUniform(const Value &val) const { + return uniformOverrides.count(&val); +} + +bool DivergenceAnalysis::isDivergent(const Value &val) const { + return divergentValues.count(&val); +} + +void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { + if (divergentValues.empty()) + return; + // iterate instructions using instructions() to ensure a deterministic order. + for (auto &I : instructions(F)) { + if (divergentValues.count(&I)) + OS << "DIVERGENT:" << I << "\n"; } } -void DivergencePropagator::propagate() { - // Traverse the dependency graph using DFS. - while (!Worklist.empty()) { - Value *V = Worklist.back(); - Worklist.pop_back(); - if (TerminatorInst *TI = dyn_cast(V)) { - // Terminators with less than two successors won't introduce sync - // dependency. Ignore them. - if (TI->getNumSuccessors() > 1) - exploreSyncDependency(TI); - } - exploreDataDependency(V); +// class LoopDivergenceAnalysis +LoopDivergenceAnalysis::LoopDivergenceAnalysis(const DominatorTree &DT, + const LoopInfo &LI, + SyncDependenceAnalysis &SDA, + const Loop &loop) + : DA(*loop.getHeader()->getParent(), &loop, DT, LI, SDA, true) { + for (const auto &I : *loop.getHeader()) { + if (!isa(I)) + break; + DA.markDivergent(I); } + + // after the scalar remainder loop is extracted, the loop exit condition will + // be uniform + auto loopExitingInst = loop.getExitingBlock()->getTerminator(); + auto loopExitCond = cast(loopExitingInst)->getCondition(); + DA.addUniformOverride(*loopExitCond); + + DA.compute(); } -} /// end namespace anonymous +bool LoopDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} + +void LoopDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of loop " << DA.getRegionLoop()->getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; +} + +// class LoopDivergencePrinter +bool LoopDivergencePrinter::runOnFunction(Function &F) { + const PostDominatorTree &PDT = + getAnalysis().getPostDomTree(); + const DominatorTree &DT = + getAnalysis().getDomTree(); + const LoopInfo &LI = getAnalysis().getLoopInfo(); + SDA = make_unique(DT, PDT, LI); + + for (auto &BB : F) { + auto *loop = LI.getLoopFor(&BB); + if (!loop || loop->getHeader() != &BB) + continue; + loopDivInfo.push_back( + make_unique(DT, LI, *SDA, *loop)); + } + + return false; +} + +void LoopDivergencePrinter::print(raw_ostream &OS, const Module *mod) const { + for (auto &divInfo : loopDivInfo) { + divInfo->print(OS, mod); + } +} // Register this pass. -char DivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) +char LoopDivergencePrinter::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(LoopDivergencePrinter, "loop-divergence", + "Loop Divergence Printer", false, true) -FunctionPass *llvm::createDivergenceAnalysisPass() { - return new DivergenceAnalysis(); +FunctionPass *llvm::createLoopDivergencePrinterPass() { + return new LoopDivergencePrinter(); } -void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { +void LoopDivergencePrinter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } -bool DivergenceAnalysis::runOnFunction(Function &F) { - auto *TTIWP = getAnalysisIfAvailable(); - if (TTIWP == nullptr) - return false; - - TargetTransformInfo &TTI = TTIWP->getTTI(F); - // Fast path: if the target does not have branch divergence, we do not mark - // any branch as divergent. - if (!TTI.hasBranchDivergence()) - return false; +// class GPUDivergenceAnalysis +GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F, + const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI, + const TargetTransformInfo &TTI) + : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) { + for (auto &I : instructions(F)) { + if (TTI.isSourceOfDivergence(&I)) { + DA.markDivergent(I); + } else if (TTI.isAlwaysUniform(&I)) { + DA.addUniformOverride(I); + } + } + for (auto &Arg : F.args()) { + if (TTI.isSourceOfDivergence(&Arg)) { + DA.markDivergent(Arg); + } + } - DivergentValues.clear(); - auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); - LLVM_DEBUG( - dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; - print(dbgs(), F.getParent()) - ); - return false; + DA.compute(); } -void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) - return; - const Value *FirstDivergentValue = *DivergentValues.begin(); - const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); - } +bool GPUDivergenceAnalysis::isDivergent(const Value &val) const { + return DA.isDivergent(val); +} - // Dumps all divergent values in F, arguments and then instructions. - for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); - OS << Arg << "\n"; - } - // Iterate instructions using instructions() to ensure a deterministic order. - for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) { - auto &BB = *BI; - OS << "\n " << BB.getName() << ":\n"; - for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); - OS << I << "\n"; - } - } - OS << "\n"; +void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const { + OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n"; + DA.print(OS, mod); + OS << "}\n"; } Index: lib/Analysis/KernelDivergenceAnalysis.cpp =================================================================== --- lib/Analysis/KernelDivergenceAnalysis.cpp +++ lib/Analysis/KernelDivergenceAnalysis.cpp @@ -1,4 +1,4 @@ -//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// +//===- KernelDivergenceAnalysis.cpp --------- Kernel Divergence Analysis Implementation -==// // // The LLVM Compiler Infrastructure // @@ -65,6 +65,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -79,6 +80,12 @@ #define DEBUG_TYPE "divergence" +// transparently use the GPUDivergenceAnalysis +static cl::opt UseGPUDA( + "use-gpu-da", cl::init(false), cl::Hidden, + cl::desc( + "turn the KernelDivergenceAnalysis into a wrapper for GPUDivergenceAnalysis")); + namespace { class DivergencePropagator { @@ -265,25 +272,27 @@ } /// end namespace anonymous // Register this pass. -char DivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", +char KernelDivergenceAnalysis::ID = 0; +INITIALIZE_PASS_BEGIN(KernelDivergenceAnalysis, "divergence", "Kernel Divergence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(KernelDivergenceAnalysis, "divergence", "Kernel Divergence Analysis", false, true) -FunctionPass *llvm::createDivergenceAnalysisPass() { - return new DivergenceAnalysis(); +FunctionPass *llvm::createKernelDivergenceAnalysisPass() { + return new KernelDivergenceAnalysis(); } -void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { +void KernelDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); + if (UseGPUDA) AU.addRequired(); AU.setPreservesAll(); } -bool DivergenceAnalysis::runOnFunction(Function &F) { +bool KernelDivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable(); if (TTIWP == nullptr) return false; @@ -295,36 +304,60 @@ return false; DivergentValues.clear(); + gpuDA = nullptr; + + auto &DT = getAnalysis().getDomTree(); auto &PDT = getAnalysis().getPostDomTree(); - DivergencePropagator DP(F, TTI, - getAnalysis().getDomTree(), - PDT, DivergentValues); - DP.populateWithSourcesOfDivergence(); - DP.propagate(); + + if (UseGPUDA) { + // run the new GPU divergence analysis + auto &LI = getAnalysis().getLoopInfo(); + gpuDA = llvm::make_unique(F, DT, PDT, LI, TTI); + + } else { + // run LLVM's existing DivergenceAnalysis + DivergencePropagator DP(F, TTI, + DT, + PDT, DivergentValues); + DP.populateWithSourcesOfDivergence(); + DP.propagate(); + } + LLVM_DEBUG( dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n"; print(dbgs(), F.getParent()) ); + return false; } -void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { - if (DivergentValues.empty()) +bool KernelDivergenceAnalysis::isDivergent(const Value *V) const { + if (gpuDA) return gpuDA->isDivergent(*V); + else return DivergentValues.count(V); +} + +void KernelDivergenceAnalysis::print(raw_ostream &OS, const Module *) const { + if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty()) return; - const Value *FirstDivergentValue = *DivergentValues.begin(); + const Function *F; - if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { - F = Arg->getParent(); - } else if (const Instruction *I = - dyn_cast(FirstDivergentValue)) { - F = I->getParent()->getParent(); - } else { - llvm_unreachable("Only arguments and instructions can be divergent"); + if (!DivergentValues.empty()) { + const Value *FirstDivergentValue = *DivergentValues.begin(); + if (const Argument *Arg = dyn_cast(FirstDivergentValue)) { + F = Arg->getParent(); + } else if (const Instruction *I = + dyn_cast(FirstDivergentValue)) { + F = I->getParent()->getParent(); + } else { + llvm_unreachable("Only arguments and instructions can be divergent"); + } + } else if (gpuDA) { + F = &gpuDA->getFunction(); } // Dumps all divergent values in F, arguments and then instructions. for (auto &Arg : F->args()) { - OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " "); + OS << (isDivergent(&Arg) ? "DIVERGENT: " : " "); OS << Arg << "\n"; } // Iterate instructions using instructions() to ensure a deterministic order. @@ -332,7 +365,7 @@ auto &BB = *BI; OS << "\n " << BB.getName() << ":\n"; for (auto &I : BB.instructionsWithoutDebug()) { - OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " "); + OS << (isDivergent(&I) ? "DIVERGENT: " : " "); OS << I << "\n"; } } Index: lib/Analysis/SyncDependenceAnalysis.cpp =================================================================== --- /dev/null +++ lib/Analysis/SyncDependenceAnalysis.cpp @@ -0,0 +1,382 @@ +//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation +//--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements an algorithm that returns for a divergent branch +// the set of basic blocks whose phi nodes become divergent due to divergent +// control. These are the blocks that are reachable by two disjoint paths from +// the branch or loop exits that have a reaching path that is disjoint from a +// path to the loop latch. +// +// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model +// control-induced divergence in phi nodes. +// +// -- Summary -- +// The SyncDependenceAnalysis lazily computes sync dependences [3]. +// The analysis evaluates the disjoint path criterion [2] by a reduction +// to SSA construction. The SSA construction algorithm is implemented as +// a simple data-flow analysis [1]. +// +// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy +// [2] "Efficiently Computing Static Single Assignment Form +// and the Control Dependence Graph", TOPLAS '91, +// Cytron, Ferrante, Rosen, Wegman and Zadeck +// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack +// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira +// +// -- Sync dependence -- +// Sync dependence [4] characterizes the control flow aspect of the +// propagation of branch divergence. For example, +// +// %cond = icmp slt i32 %tid, 10 +// br i1 %cond, label %then, label %else +// then: +// br label %merge +// else: +// br label %merge +// merge: +// %a = phi i32 [ 0, %then ], [ 1, %else ] +// +// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid +// because %tid is not on its use-def chains, %a is sync dependent on %tid +// because the branch "br i1 %cond" depends on %tid and affects which value %a +// is assigned to. +// +// -- Reduction to SSA construction -- +// There are two disjoint paths from A to X, if a certain variant of SSA +// construction places a phi node in X under the following set-up scheme [2]. +// +// This variant of SSA construction ignores incoming undef values. +// That is paths from the entry without a definition do not result in +// phi nodes. +// +// entry +// / \ +// A \ +// / \ Y +// B C / +// \ / \ / +// D E +// \ / +// F +// Assume that A contains a divergent branch. We are interested +// in the set of all blocks where each block is reachable from A +// via two disjoint paths. This would be the set {D, F} in this +// case. +// To generally reduce this query to SSA construction we introduce +// a virtual variable x and assign to x different values in each +// successor block of A. +// entry +// / \ +// A \ +// / \ Y +// x = 0 x = 1 / +// \ / \ / +// D E +// \ / +// F +// Our flavor of SSA construction for x will construct the following +// entry +// / \ +// A \ +// / \ Y +// x0 = 0 x1 = 1 / +// \ / \ / +// x2=phi E +// \ / +// x3=phi +// The blocks D and F contain phi nodes and are thus each reachable +// by two disjoins paths from A. +// +// -- Remarks -- +// In case of loop exits we need to check the disjoint path criterion for loops +// [2]. To this end, we check whether the definition of x differs between the +// loop exit and the loop header (_after_ SSA construction). +// +//===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/SyncDependenceAnalysis.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" + +#include +#include + +#define DEBUG_TYPE "sync-dependence" + +namespace llvm { + +ConstBlockSet SyncDependenceAnalysis::emptyBlockSet; + +SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT, + const PostDominatorTree &PDT, + const LoopInfo &LI) + : funcRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {} + +SyncDependenceAnalysis::~SyncDependenceAnalysis() { + for (auto it : cachedBranchJoins) { + delete it.second; + } + for (auto it : cachedLoopExitJoins) { + delete it.second; + } +} + +using FunctionRPOT = ReversePostOrderTraversal; + +// divergence propagator for reducible CFGs +struct DivergencePropagator { + const FunctionRPOT &funcRPOT; + const DominatorTree &DT; + const PostDominatorTree &PDT; + const LoopInfo &LI; + + // identified join points + ConstBlockSet *const joinBlocks; + + // reached loop exits (by a path disjoint to a path to the loop header) + SmallPtrSet reachedLoopExits; + + // if defMap[B] == C then C is the dominating definition at block B + // if defMap[B] ~ undef then we haven't seen B yet + // if defMap[B] == B then B is a join point of disjoint paths from X + using DefMap = std::map; + DefMap defMap; + + // all blocks with pending visits + std::unordered_set pendingUpdates; + + DivergencePropagator(const FunctionRPOT &funcRPOT, const DominatorTree &DT, + const PostDominatorTree &PDT, const LoopInfo &LI) + : funcRPOT(funcRPOT), DT(DT), PDT(PDT), LI(LI), + joinBlocks(new ConstBlockSet) {} + + // set the definition at @block and mark @block as pending for a visit + void addPending(const BasicBlock &block, const BasicBlock &defBlock) { + bool wasAdded = defMap.emplace(&block, &defBlock).second; + if (wasAdded) + pendingUpdates.insert(&block); + } + + void printDefs(raw_ostream &out) { + out << "Propagator::defMap {\n"; + for (const auto *block : funcRPOT) { + auto it = defMap.find(block); + out << block->getName() << " : "; + if (it == defMap.end()) { + out << "\n"; + } else { + const auto *defBlock = it->second; + out << (defBlock ? defBlock->getName() : "") << "\n"; + } + } + out << "}\n"; + } + + // process @succBlock with reaching definition @defBlock + // the original divergent branch was in @parentLoop (if any) + void visitSuccessor(const BasicBlock &succBlock, const Loop *parentLoop, + const BasicBlock &defBlock) { + + // @succBlock is a loop exit + if (parentLoop && !parentLoop->contains(&succBlock)) { + defMap.emplace(&succBlock, &defBlock); + reachedLoopExits.insert(&succBlock); + return; + } + + // first reaching def? + auto itLastDef = defMap.find(&succBlock); + if (itLastDef == defMap.end()) { + addPending(succBlock, defBlock); + return; + } + + // a join of at least two definitions + if (itLastDef->second != &defBlock) { + // do we know this join already? + if (!joinBlocks->insert(&succBlock).second) + return; + + // update the definition + addPending(succBlock, succBlock); + } + } + + // find all blocks reachable by two disjoint paths from @rootTerm. + // This method works for both divergent TerminatorInsts and loops with + // divergent exits. + // @rootBlock is either the block containing the branch or the header of the + // divergent loop. + // @nodeSuccessors is the set of successors of the node (Loop or Terminator) + // headed by @rootBlock. + // @parentLoop is the parent loop of the Loop or the loop that contains the + // Terminator. + template + ConstBlockSet *computeJoinPoints(const BasicBlock &rootBlock, + SuccessorIterable nodeSuccessors, + const Loop *parentLoop) { + // immediate post dominator (no join block beyond that block) + const auto *pdNode = PDT.getNode(const_cast(&rootBlock)); + const auto *ipdNode = pdNode->getIDom(); + const auto *pdBoundBlock = ipdNode ? ipdNode->getBlock() : nullptr; + + // bootstrap with branch targets + for (const auto *succBlock : nodeSuccessors) { + defMap.emplace(succBlock, succBlock); + + if (parentLoop && !parentLoop->contains(succBlock)) { + // immediate loop exit from node. + reachedLoopExits.insert(succBlock); + continue; + } else { + // regular successor + pendingUpdates.insert(succBlock); + } + } + + auto itBeginRPO = funcRPOT.begin(); + + // skip until term (TODO RPOT won't let us start at @term directly) + for (; *itBeginRPO != &rootBlock; ++itBeginRPO) { + } + + auto itEndRPO = funcRPOT.end(); + assert(itBeginRPO != itEndRPO); + + // propagate definitions at the immediate successors of the node in RPO + auto itBlockRPO = itBeginRPO; + while (++itBlockRPO != itEndRPO && *itBlockRPO != pdBoundBlock) { + const auto *block = *itBlockRPO; + + // skip @block if not pending update + auto itPending = pendingUpdates.find(block); + if (itPending == pendingUpdates.end()) + continue; + pendingUpdates.erase(itPending); + + // propagate definition at @block to its successors + auto itDef = defMap.find(block); + const auto *defBlock = itDef->second; + assert(defBlock); + + auto *blockLoop = LI.getLoopFor(block); + if (parentLoop && + (parentLoop != blockLoop && parentLoop->contains(blockLoop))) { + // if the successor is the header of a nested loop pretend its a + // single node with the loop's exits as successors + SmallVector blockLoopExits; + blockLoop->getExitBlocks(blockLoopExits); + for (const auto *blockLoopExit : blockLoopExits) { + visitSuccessor(*blockLoopExit, parentLoop, *defBlock); + } + + } else { + // the successors are either on the same loop level or loop exits + for (const auto *succBlock : successors(block)) { + visitSuccessor(*succBlock, parentLoop, *defBlock); + } + } + } + + // We need to know the definition at the parent loop header to decide + // whether the definition at the header is different from the definition at + // the loop exits, which would indicate a divergent loop exits. + // + // A // loop header + // | + // B // nested loop header + // | + // C -> X (exit from B loop) -..-> (A latch) + // | + // D -> back to B (B latch) + // | + // proper exit from both loops + // + // D post-dominates B as it is the only proper exit from the "A loop". + // If C has a divergent branch, propagation will therefore stop at D. + // That implies that B will never receive a definition. + // But that definition can only be the same as at D (D itself in thise case) + // because all paths to anywhere have to pass through D. + // + const BasicBlock *parentLoopHeader = + parentLoop ? parentLoop->getHeader() : nullptr; + if (parentLoop && parentLoop->contains(pdBoundBlock)) { + defMap[parentLoopHeader] = defMap[pdBoundBlock]; + } + + // analyze reached loop exits + if (!reachedLoopExits.empty()) { + assert(parentLoop); + const auto *headerDefBlock = defMap[parentLoopHeader]; + LLVM_DEBUG(printDefs(dbgs())); + assert(headerDefBlock && "no definition in header of carrying loop"); + + for (const auto *exitBlock : reachedLoopExits) { + assert((defMap[exitBlock] != nullptr) && + "no reaching def at loop exit"); + if (defMap[exitBlock] != headerDefBlock) { + joinBlocks->insert(exitBlock); + } + } + } + + return joinBlocks; + } +}; + +const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &loop) { + using LoopExitVec = SmallVector; + LoopExitVec loopExits; + loop.getExitBlocks(loopExits); + if (loopExits.size() < 1) { + return emptyBlockSet; + } + + // already available in cache? + auto it = cachedLoopExitJoins.find(&loop); + if (it != cachedLoopExitJoins.end()) + return *it->second; + + // compute all join points + DivergencePropagator propagator{funcRPOT, DT, PDT, LI}; + auto *joinBlocks = propagator.computeJoinPoints( + *loop.getHeader(), loopExits, loop.getParentLoop()); + + cachedLoopExitJoins[&loop] = joinBlocks; + return *joinBlocks; +} + +const ConstBlockSet & +SyncDependenceAnalysis::join_blocks(const TerminatorInst &term) { + // trivial case + if (term.getNumSuccessors() < 1) { + return emptyBlockSet; + } + + // already available in cache? + auto it = cachedBranchJoins.find(&term); + if (it != cachedBranchJoins.end()) + return *it->second; + + // compute all join points + DivergencePropagator propagator{funcRPOT, DT, PDT, LI}; + const auto &termBlock = *term.getParent(); + auto *joinBlocks = propagator.computeJoinPoints( + termBlock, successors(term.getParent()), LI.getLoopFor(&termBlock)); + + cachedBranchJoins[&term] = joinBlocks; + return *joinBlocks; +} + +} // namespace llvm Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -984,7 +984,7 @@ void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - DivergenceAnalysis * Divergence) { + KernelDivergenceAnalysis * Divergence) { MF = &NewMF; SDAGISelPass = PassPtr; ORE = &NewORE; Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -417,7 +417,7 @@ SplitCriticalSideEffectEdges(const_cast(Fn), DT, LI); CurDAG->init(*MF, *ORE, this, LibInfo, - getAnalysisIfAvailable()); + getAnalysisIfAvailable()); FuncInfo->set(Fn, *MF, CurDAG); // Now get the optional analyzes if we want to. Index: lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -16,7 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/IRBuilder.h" @@ -32,7 +32,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor { - DivergenceAnalysis *DA; + KernelDivergenceAnalysis *DA; MemoryDependenceResults *MDR; LoopInfo *LI; DenseMap noClobberClones; @@ -49,7 +49,7 @@ return "AMDGPU Annotate Uniform Values"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.setPreservesAll(); @@ -64,7 +64,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, @@ -176,7 +176,7 @@ if (skipFunction(F)) return false; - DA = &getAnalysis(); + DA = &getAnalysis(); MDR = &getAnalysis().getMemDep(); LI = &getAnalysis().getLoopInfo(); isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -18,7 +18,7 @@ #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" @@ -60,7 +60,7 @@ public InstVisitor { const GCNSubtarget *ST = nullptr; AssumptionCache *AC = nullptr; - DivergenceAnalysis *DA = nullptr; + KernelDivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; AMDGPUAS AMDGPUASI; @@ -177,7 +177,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } }; @@ -898,7 +898,7 @@ const AMDGPUTargetMachine &TM = TPC->getTM(); ST = &TM.getSubtarget(F); AC = &getAnalysis().getAssumptionCache(F); - DA = &getAnalysis(); + DA = &getAnalysis(); HasUnsafeFPMath = hasUnsafeFPMath(F); AMDGPUASI = TM.getAMDGPUAS(); @@ -918,7 +918,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -87,7 +87,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); SelectionDAGISel::getAnalysisUsage(AU); } @@ -257,7 +257,7 @@ "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -16,7 +16,6 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" Index: lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -25,7 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" @@ -70,7 +70,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) @@ -78,10 +78,10 @@ // TODO: Preserve dominator tree. AU.addRequired(); - AU.addRequired(); + AU.addRequired(); // No divergent values are changed, only blocks and branch edges. - AU.addPreserved(); + AU.addPreserved(); // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); @@ -95,7 +95,7 @@ /// \returns true if \p BB is reachable through only uniform branches. /// XXX - Is there a more efficient way to find this? -static bool isUniformlyReached(const DivergenceAnalysis &DA, +static bool isUniformlyReached(const KernelDivergenceAnalysis &DA, BasicBlock &BB) { SmallVector Stack; SmallPtrSet Visited; @@ -163,7 +163,7 @@ if (PDT.getRoots().size() <= 1) return false; - DivergenceAnalysis &DA = getAnalysis(); + KernelDivergenceAnalysis &DA = getAnalysis(); // Loop over all of the blocks in a function, tracking all of the blocks that // return. Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" @@ -52,7 +52,7 @@ using StackVector = SmallVector; class SIAnnotateControlFlow : public FunctionPass { - DivergenceAnalysis *DA; + KernelDivergenceAnalysis *DA; Type *Boolean; Type *Void; @@ -116,7 +116,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); } @@ -127,7 +127,7 @@ INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -386,7 +386,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); - DA = &getAnalysis(); + DA = &getAnalysis(); for (df_iterator I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -324,7 +324,7 @@ unsigned Depth = 0) const override; bool isSDNodeSourceOfDivergence(const SDNode *N, - FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; + FunctionLoweringInfo *FLI, KernelDivergenceAnalysis *DA) const override; bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -8486,7 +8486,7 @@ } bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, - FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const + FunctionLoweringInfo * FLI, KernelDivergenceAnalysis * KDA) const { switch (N->getOpcode()) { case ISD::Register: @@ -8519,7 +8519,7 @@ else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) return true; } - return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg)); } } break; Index: lib/Transforms/Scalar/LoopUnswitch.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnswitch.cpp +++ lib/Transforms/Scalar/LoopUnswitch.cpp @@ -33,7 +33,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -215,7 +215,7 @@ AU.addRequired(); AU.addRequired(); if (hasBranchDivergence) - AU.addRequired(); + AU.addRequired(); getLoopAnalysisUsage(AU); } @@ -383,7 +383,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) @@ -864,7 +864,7 @@ return false; } if (hasBranchDivergence && - getAnalysis().isDivergent(LoopCond)) { + getAnalysis().isDivergent(LoopCond)) { LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << currentLoop->getHeader()->getName() << " at non-trivial condition '" << *Val Index: lib/Transforms/Scalar/StructurizeCFG.cpp =================================================================== --- lib/Transforms/Scalar/StructurizeCFG.cpp +++ lib/Transforms/Scalar/StructurizeCFG.cpp @@ -13,7 +13,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/KernelDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" @@ -183,7 +183,7 @@ Function *Func; Region *ParentRegion; - DivergenceAnalysis *DA; + KernelDivergenceAnalysis *DA; DominatorTree *DT; LoopInfo *LI; @@ -269,7 +269,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { if (SkipUniformRegions) - AU.addRequired(); + AU.addRequired(); AU.addRequiredID(LowerSwitchID); AU.addRequired(); AU.addRequired(); @@ -285,7 +285,7 @@ INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(LowerSwitch) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) @@ -914,7 +914,7 @@ } static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID, - const DivergenceAnalysis &DA) { + const KernelDivergenceAnalysis &DA) { for (auto E : R->elements()) { if (!E->isSubRegion()) { auto Br = dyn_cast(E->getEntry()->getTerminator()); @@ -962,7 +962,7 @@ // but we shouldn't rely on metadata for correctness! unsigned UniformMDKindID = R->getEntry()->getContext().getMDKindID("structurizecfg.uniform"); - DA = &getAnalysis(); + DA = &getAnalysis(); if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) { LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll @@ -0,0 +1,14 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s + +define amdgpu_kernel void @workitem_id_x() #1 { + %id.x = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) +; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-da %s | FileCheck %s ; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst define i32 @test1(i32* %ptr, i32 %val) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s + +define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll @@ -0,0 +1,224 @@ +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s + +; divergent loop (H
, B) +; the divergent join point in %exit is obscured by uniform control joining in %X +define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (H
, B) +; the phi nodes in X and Y don't actually receive divergent values +define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'unobserved_loop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + br i1 %uni.cond, label %X, label %H ; uniform + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ] + %div.exitx = icmp slt i32 %tid, 0 + br i1 %div.exitx, label %X, label %B ; divergent branch +; CHECK: DIVERGENT: %div.exitx = +; CHECK: DIVERGENT: br i1 %div.exitx, + +B: + %uni.inc = add i32 %uni.merge.h, 1 + %div.exity = icmp sgt i32 %tid, 0 + br i1 %div.exity, label %Y, label %H ; divergent branch +; CHECK: DIVERGENT: %div.exity = +; CHECK: DIVERGENT: br i1 %div.exity, + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] + br label %exit + +Y: + %uni.merge.y = phi i32 [ %b, %B ] + br label %exit + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) inside divergent loop (H
, B, C, D, G, L) +; the inner loop has no exit to top level. +; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D. +define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_nestedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %X, label %C + +C: + br i1 %uni.cond, label %Y, label %D + +D: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %C, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %D, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi + br i1 %uni.cond, label %Y, label %exit +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ] + br label %exit +; CHECK: DIVERGENT: %div.merge.y = + +exit: + %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ] + ret void +; CHECK: DIVERGENT: %div.merge.exit = +} + +; divergent loop (G
, L) in divergent loop (H
, B, C, G, L) +; the outer loop has no immediately divergent exiting edge. +; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent. +define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_doublebreak_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + br i1 %uni.cond, label %Y, label %C + +C: + %uni.inc = add i32 %uni.merge.h, 1 + br label %H + +G: + br i1 %div.exitx, label %X, label %L ; two-level break +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + br i1 %uni.cond, label %C, label %G + +X: + %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence + br label %Y +; CHECK: DIVERGENT: %div.merge.x = + +Y: + %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ] + ret void +; CHECK: DIVERGENT: %div.merge.y = +} + +; divergent loop (G
, L) contained inside a uniform loop (H
, B, G, L , D) +define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_containedloop_diverge': +; CHECK-NOT: DIVERGENT: %uni. +; CHECK-NOT: DIVERGENT: br i1 %uni. + +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %uni.cond = icmp slt i32 %a, 0 + %div.exitx = icmp slt i32 %tid, 0 + br i1 %uni.cond, label %X, label %H + +H: + %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ] + br i1 %uni.cond, label %G, label %B +; CHECK: DIVERGENT: %div.exitx = + +B: + %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ] + br label %D +; CHECK: DIVERGENT: %div.merge.b = + +G: + %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ] + br i1 %div.exitx, label %B, label %L +; CHECK: DIVERGENT: br i1 %div.exitx, + +L: + %uni.inc.l = add i32 %uni.merge.g, 1 + br i1 %uni.cond, label %G, label %D + +D: + %uni.inc.d = add i32 %uni.merge.h, 1 + br i1 %uni.cond, label %X, label %H + +X: + %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ] + ret void + +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-da %s | FileCheck %s ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -1,6 +1,6 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-da | FileCheck %s -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_ps': ; CHECK: DIVERGENT: ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 @@ -14,7 +14,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_kernel': ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 ; CHECK-NOT: %arg2 @@ -26,7 +26,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c': +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_c': ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,4 +1,4 @@ -;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-da %s | FileCheck %s ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap( define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -1,4 +1,4 @@ -;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s +;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-da %s | FileCheck %s ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32( define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 { Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s +; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-da | FileCheck %s ; CHECK: DIVERGENT: %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2 ; CHECK: DIVERGENT: %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4 Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-da %s | FileCheck %s ; CHECK-LABEL: 'test1': ; CHECK-NEXT: DIVERGENT: i32 %bound Index: test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll @@ -1,17 +0,0 @@ -; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s - -; CHECK: DIVERGENT: %tmp = cmpxchg volatile -define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 { -entry: - unreachable - -unreachable_loop: ; preds = %do.body.i, %if.then11 - %tmp = cmpxchg volatile i32 addrspace(1)* null, i32 0, i32 0 seq_cst seq_cst - %cmp.i = extractvalue { i32, i1 } %tmp, 1 - br i1 %cmp.i, label %unreachable_loop, label %end - -end: ; preds = %do.body.i51, %atomicAdd_g_f.exit - unreachable -} - -attributes #0 = { norecurse nounwind } Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence %s | FileCheck %s +; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 Index: test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll @@ -0,0 +1,74 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] +; CHECK-NEXT: DIVERGENT: %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: store double %x.0.lcssa, double* %arrayidx10, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body4 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %idxprom5 = sext i32 %0 to i64 +; CHECK-NEXT: DIVERGENT: %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 +; CHECK-NEXT: DIVERGENT: %1 = load double*, double** %arrayidx6, align 8 +; CHECK-NEXT: DIVERGENT: %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 +; CHECK-NEXT: DIVERGENT: %2 = load double, double* %arrayidx8, align 8 +; CHECK-NEXT: DIVERGENT: %add = fadd double %x.025, %2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @test(i32* nocapture readonly %Index, double** nocapture readonly %A, double* nocapture %C, i32 %m, i32 %n) #0 { +entry: + %cmp27 = icmp sgt i32 %n, 0 + br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp224 = icmp sgt i32 %m, 0 + %wide.trip.count = zext i32 %m to i64 + %wide.trip.count31 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ] + br i1 %cmp224, label %for.body4.preheader, label %for.cond.cleanup3 + +for.body4.preheader: ; preds = %for.body + br label %for.body4 + +for.cond.cleanup3: ; preds = %for.body4, %for.body + %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ] + %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29 + store double %x.0.lcssa, double* %arrayidx10, align 8 + %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1 + %exitcond32 = icmp eq i64 %indvars.iv.next30, %wide.trip.count31 + br i1 %exitcond32, label %for.cond.cleanup, label %for.body + +for.body4: ; preds = %for.body4.preheader, %for.body4 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ] + %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %idxprom5 = sext i32 %0 to i64 + %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5 + %1 = load double*, double** %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29 + %2 = load double, double* %arrayidx8, align 8 + %add = fadd double %x.025, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup3, label %for.body4 +} + +attributes #0 = { norecurse nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll @@ -0,0 +1,44 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %hfreq = srem i64 %indvars.iv, 2 +; CHECK-NEXT: DIVERGENT: %toggle = trunc i64 %hfreq to i1 +; CHECK-NEXT: DIVERGENT: br i1 %toggle, label %A, label %B +; CHECK-NEXT: DIVERGENT: %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %hfreq = srem i64 %indvars.iv, 2 + %toggle = trunc i64 %hfreq to i1 + br i1 %toggle, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll @@ -0,0 +1,60 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Printing analysis 'Loop Divergence Printer' for function 'test1': +; CHECK-NEXT: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv +; CHECK-NEXT: DIVERGENT: br i1 %exitcond2, label %for.latch, label %for.body2 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } + +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } + Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll @@ -0,0 +1,31 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %cast, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + store float %cast, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll @@ -0,0 +1,39 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float %divphi, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ] + %invar = trunc i64 %n to i1 + br i1 %invar, label %A, label %B + +A: + %trunc = trunc i64 %n to i32 + %cast = sitofp i32 %trunc to float + br label %B + +B: + %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ] + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv + store float %divphi, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll @@ -0,0 +1,55 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] +; CHECK-NEXT: DIVERGENT: %row = mul i64 %n, %indvars.iv +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body2 { +; CHECK-NEXT: DIVERGENT: %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] +; CHECK-NEXT: DIVERGENT: %idx = add i64 %row, %indvars.iv2 +; CHECK-NEXT: DIVERGENT: %trunc = trunc i64 %idx to i32 +; CHECK-NEXT: DIVERGENT: %val = sitofp i32 %trunc to float +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx +; CHECK-NEXT: DIVERGENT: store float %val, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %ptr, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %exit + +for.body.lr.ph: ; preds = %entry + br label %for.body + +exit: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ] + br label %for.body2 + +for.body2: + %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ] + %row = mul i64 %n, %indvars.iv + %idx = add i64 %row, %indvars.iv2 + %trunc = trunc i64 %idx to i32 + %val = sitofp i32 %trunc to float + %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx + store float %val, float* %arrayidx, align 4 + %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp eq i64 %indvars.iv.next2, %n + br i1 %exitcond2, label %for.latch, label %for.body2 + +for.latch: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %exit, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll @@ -0,0 +1,110 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] +; CHECK-NEXT: DIVERGENT: %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8.lr.ph { +; CHECK-NEXT: DIVERGENT: %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] +; CHECK-NEXT: DIVERGENT: %1 = sext i32 %mul44 to i64 +; CHECK-NEXT: DIVERGENT: %2 = sext i32 %len.043 to i64 +; CHECK-NEXT: DIVERGENT: %mul = shl nsw i32 %mul44, 1 +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: DIVERGENT: %cmp6 = icmp slt i64 %indvars.iv.next, %0 +; CHECK-NEXT: DIVERGENT: br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +; CHECK-NEXT: } +; CHECK: Divergence of loop for.body8 { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] +; CHECK-NEXT: DIVERGENT: %3 = add nsw i64 %indvars.iv, %2 +; CHECK-NEXT: DIVERGENT: %4 = mul nsw i64 %3, %0 +; CHECK-NEXT: DIVERGENT: %5 = add nsw i64 %4, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds double, double* %A, i64 %5 +; CHECK-NEXT: DIVERGENT: %6 = load double, double* %arrayidx, align 8 +; CHECK-NEXT: DIVERGENT: %7 = mul nsw i64 %indvars.iv, %0 +; CHECK-NEXT: DIVERGENT: %8 = add nsw i64 %7, %indvars.iv53 +; CHECK-NEXT: DIVERGENT: %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 +; CHECK-NEXT: DIVERGENT: %9 = load double, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %add15 = fadd double %6, %9 +; CHECK-NEXT: DIVERGENT: store double %add15, double* %arrayidx14, align 8 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add i64 %indvars.iv, %1 +; CHECK-NEXT: } + +; Function Attrs: norecurse nounwind uwtable +define void @foo(double* nocapture %A, i32 %n) local_unnamed_addr #0 { +entry: + %cmp45 = icmp sgt i32 %n, 0 + br i1 %cmp45, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp242 = icmp sgt i32 %n, 2 + %0 = sext i32 %n to i64 + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.body: ; preds = %for.cond.cleanup3, %for.body.lr.ph + %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ] + br i1 %cmp242, label %for.body8.lr.ph.preheader, label %for.cond.cleanup3 + +for.body8.lr.ph.preheader: ; preds = %for.body + br label %for.body8.lr.ph + +for.cond.cleanup3: ; preds = %for.cond.cleanup7, %for.body + %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1 + %exitcond = icmp eq i64 %indvars.iv.next54, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.body8.lr.ph: ; preds = %for.body8.lr.ph.preheader, %for.cond.cleanup7 + %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ] + %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ] + %1 = sext i32 %mul44 to i64 + %2 = sext i32 %len.043 to i64 + br label %for.body8 + +for.cond.cleanup7: ; preds = %for.body8 + %mul = shl nsw i32 %mul44, 1 + %cmp2 = icmp slt i32 %mul, %n + br i1 %cmp2, label %for.body8.lr.ph, label %for.cond.cleanup3 + +for.body8: ; preds = %for.body8.lr.ph, %for.body8 + %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ] + %3 = add nsw i64 %indvars.iv, %2 + %4 = mul nsw i64 %3, %0 + %5 = add nsw i64 %4, %indvars.iv53 + %arrayidx = getelementptr inbounds double, double* %A, i64 %5 + %6 = load double, double* %arrayidx, align 8 + %7 = mul nsw i64 %indvars.iv, %0 + %8 = add nsw i64 %7, %indvars.iv53 + %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8 + %9 = load double, double* %arrayidx14, align 8 + %add15 = fadd double %6, %9 + store double %add15, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, %1 + %cmp6 = icmp slt i64 %indvars.iv.next, %0 + br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 +} + +attributes #0 = { norecurse nounwind uwtable } Index: test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll @@ -0,0 +1,29 @@ +; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s + +; CHECK: Divergence of loop for.body { +; CHECK-NEXT: DIVERGENT: %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: DIVERGENT: %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv +; CHECK-NEXT: DIVERGENT: store float 4.200000e+01, float* %arrayidx, align 4 +; CHECK-NEXT: DIVERGENT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: } +define void @test1(float* nocapture %A, i64 %n) #0 { + entry: + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float 4.200000e+01, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { nounwind } Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll @@ -0,0 +1,47 @@ +; RUN: opt %s -analyze -divergence -use-gpu-da | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @daorder(i32 %n) { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'daorder' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond = icmp slt i32 %tid, 0 + br i1 %cond, label %A, label %B ; divergent +; CHECK: DIVERGENT: br i1 %cond, +A: + %defAtA = add i32 %n, 1 ; uniform +; CHECK-NOT: DIVERGENT: %defAtA = + br label %C +B: + %defAtB = add i32 %n, 2 ; uniform +; CHECK-NOT: DIVERGENT: %defAtB = + br label %C +C: + %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent +; CHECK: DIVERGENT: %defAtC = + br label %D + +D: + %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform +; CHECK-NOT: DIVERGENT: %i = phi + br label %E + +E: + %i.inc = add i32 %i, 1 + %loopCnt = icmp slt i32 %i.inc, %n +; CHECK-NOT: DIVERGENT: %loopCnt = + br i1 %loopCnt, label %D, label %exit + +exit: + ret i32 %n +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() +declare i32 @llvm.nvvm.read.ptx.sreg.laneid() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32)* @daorder, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll +++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll @@ -1,11 +1,11 @@ -; RUN: opt %s -analyze -divergence | FileCheck %s +; RUN: opt %s -analyze -divergence -use-gpu-da | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'no_diverge' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %cond = icmp slt i32 %n, 0 @@ -27,7 +27,7 @@ ; c = b; ; return c; // c is divergent: sync dependent define i32 @sync(i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'sync' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() %cond = icmp slt i32 %tid, 5 @@ -48,7 +48,7 @@ ; // c here is divergent because it is sync dependent on threadIdx.x >= 5 ; return c; define i32 @mixed(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'mixed' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() %cond = icmp slt i32 %tid, 5 @@ -73,7 +73,7 @@ ; We conservatively treats all parameters of a __device__ function as divergent. define i32 @device(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'device' ; CHECK: DIVERGENT: i32 %n ; CHECK: DIVERGENT: i32 %a ; CHECK: DIVERGENT: i32 %b @@ -98,7 +98,7 @@ ; ; The i defined in the loop is used outside. define i32 @loop() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'loop' entry: %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() br label %loop @@ -120,7 +120,7 @@ ; Same as @loop, but the loop is in the LCSSA form. define i32 @lcssa() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'lcssa' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() br label %loop @@ -142,49 +142,6 @@ ret i32 1 } -; This test contains an unstructured loop. -; +-------------- entry ----------------+ -; | | -; V V -; i1 = phi(0, i3) i2 = phi(0, i3) -; j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2 -; ^ | ^ -; | V | -; +-------- switch (tid / i3) ----------+ -; | -; V -; if (i3 == 5) // divergent -; because sync dependent on (tid / i3). -define i32 @unstructured_loop(i1 %entry_cond) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' -entry: - %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 -loop_entry_1: - %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] - %j1 = add i32 %i1, 1 - br label %loop_body -loop_entry_2: - %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ] - %j2 = add i32 %i2, 2 - br label %loop_body -loop_body: - %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ] - br label %loop_latch -loop_latch: - %div = sdiv i32 %tid, %i3 - switch i32 %div, label %branch [ i32 1, label %loop_entry_1 - i32 2, label %loop_entry_2 ] -branch: - %cmp = icmp eq i32 %i3, 5 - br i1 %cmp, label %then, label %else -; CHECK: DIVERGENT: br i1 %cmp, -then: - ret i32 0 -else: - ret i32 1 -} - ; Verifies sync-dependence is computed correctly in the absense of loops. define i32 @sync_no_loop(i32 %arg) { entry: @@ -210,10 +167,9 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() declare i32 @llvm.nvvm.read.ptx.sreg.laneid() -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5} +!nvvm.annotations = !{!0, !1, !2, !3, !4} !0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1} !1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1} !2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1} !3 = !{i32 ()* @loop, !"kernel", i32 1} -!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1} -!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} +!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1} Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll =================================================================== --- /dev/null +++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll @@ -0,0 +1,30 @@ +; RUN: opt %s -analyze -divergence -use-gpu-da | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) { +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_diverge' +entry: + %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %cond.var = icmp slt i32 %tid, 0 + br i1 %cond.var, label %B, label %C ; divergent +; CHECK: DIVERGENT: br i1 %cond.var, +B: + %cond.uni = icmp slt i32 %n, 0 + br i1 %cond.uni, label %C, label %merge ; uniform +; CHECK-NOT: DIVERGENT: br i1 %cond.uni, +C: + %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ] +; CHECK: DIVERGENT: %phi.var.hidden = phi i32 + br label %merge +merge: + %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ] +; CHECK: DIVERGENT: %phi.ipd = phi i32 + ret i32 %phi.ipd +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!nvvm.annotations = !{!0} +!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1} Index: test/Analysis/KernelDivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- test/Analysis/KernelDivergenceAnalysis/AMDGPU/kernel-args.ll +++ test/Analysis/KernelDivergenceAnalysis/AMDGPU/kernel-args.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps': +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_ps': ; CHECK: DIVERGENT: ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 @@ -14,7 +14,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel': +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_kernel': ; CHECK-NOT: %arg0 ; CHECK-NOT: %arg1 ; CHECK-NOT: %arg2 @@ -26,7 +26,7 @@ ret void } -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c': +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_c': ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: ; CHECK: DIVERGENT: Index: test/Analysis/KernelDivergenceAnalysis/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/KernelDivergenceAnalysis/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True Index: test/Analysis/KernelDivergenceAnalysis/NVPTX/diverge.ll =================================================================== --- test/Analysis/KernelDivergenceAnalysis/NVPTX/diverge.ll +++ test/Analysis/KernelDivergenceAnalysis/NVPTX/diverge.ll @@ -5,7 +5,7 @@ ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x) define i32 @no_diverge(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'no_diverge' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %cond = icmp slt i32 %n, 0 @@ -27,7 +27,7 @@ ; c = b; ; return c; // c is divergent: sync dependent define i32 @sync(i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'sync' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() %cond = icmp slt i32 %tid, 5 @@ -48,7 +48,7 @@ ; // c here is divergent because it is sync dependent on threadIdx.x >= 5 ; return c; define i32 @mixed(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'mixed' bb1: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() %cond = icmp slt i32 %tid, 5 @@ -73,7 +73,7 @@ ; We conservatively treats all parameters of a __device__ function as divergent. define i32 @device(i32 %n, i32 %a, i32 %b) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'device' ; CHECK: DIVERGENT: i32 %n ; CHECK: DIVERGENT: i32 %a ; CHECK: DIVERGENT: i32 %b @@ -98,7 +98,7 @@ ; ; The i defined in the loop is used outside. define i32 @loop() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'loop' entry: %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid() br label %loop @@ -120,7 +120,7 @@ ; Same as @loop, but the loop is in the LCSSA form. define i32 @lcssa() { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'lcssa' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() br label %loop @@ -156,7 +156,7 @@ ; if (i3 == 5) // divergent ; because sync dependent on (tid / i3). define i32 @unstructured_loop(i1 %entry_cond) { -; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop' +; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'unstructured_loop' entry: %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2 Index: test/Analysis/KernelDivergenceAnalysis/NVPTX/lit.local.cfg =================================================================== --- /dev/null +++ test/Analysis/KernelDivergenceAnalysis/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'NVPTX' in config.root.targets: + config.unsupported = True